diff options
author | Unit 193 <unit193@unit193.net> | 2020-10-25 17:59:29 -0400 |
---|---|---|
committer | Unit 193 <unit193@unit193.net> | 2020-10-25 17:59:29 -0400 |
commit | 6c77ac67811ca061b022e9677f3ef365625b0f3e (patch) | |
tree | 43606f6d2894d5211b1f8e0456a0e0162684e444 | |
parent | fa22dd3889bb4b898017195e13eb15ba0431255e (diff) | |
parent | 5dc7d6f5902ddaee5223d041d5c10060f0c72430 (diff) | |
download | gallery-dl-6c77ac67811ca061b022e9677f3ef365625b0f3e.tar.bz2 gallery-dl-6c77ac67811ca061b022e9677f3ef365625b0f3e.tar.xz gallery-dl-6c77ac67811ca061b022e9677f3ef365625b0f3e.tar.zst |
Update upstream source from tag 'upstream/1.15.2'
Update to upstream version '1.15.2'
with Debian dir b11433f3e7944c55987dcada15dce64a82e74ce6
94 files changed, 632 insertions, 477 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index 0a55546..f382013 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,25 @@ # Changelog +## 1.15.2 - 2020-10-24 +### Additions +- [pinterest] implement login support ([#1055](https://github.com/mikf/gallery-dl/issues/1055)) +- [reddit] add `date` metadata field ([#1068](https://github.com/mikf/gallery-dl/issues/1068)) +- [seiga] add metadata for single image downloads ([#1063](https://github.com/mikf/gallery-dl/issues/1063)) +- [twitter] support media from Cards ([#937](https://github.com/mikf/gallery-dl/issues/937), [#1005](https://github.com/mikf/gallery-dl/issues/1005)) +- [weasyl] support api-key authentication ([#1057](https://github.com/mikf/gallery-dl/issues/1057)) +- add a `t` format string conversion for trimming whitespace ([#1065](https://github.com/mikf/gallery-dl/issues/1065)) +### Fixes +- [blogger] handle URLs with specified width/height ([#1061](https://github.com/mikf/gallery-dl/issues/1061)) +- [fallenangels] fix extraction of `.5` chapters +- [gelbooru] rewrite mp4 video URLs ([#1048](https://github.com/mikf/gallery-dl/issues/1048)) +- [hitomi] fix image URLs and gallery URL pattern +- [mangadex] unescape more metadata fields ([#1066](https://github.com/mikf/gallery-dl/issues/1066)) +- [mangahere] ensure download URLs have a scheme ([#1070](https://github.com/mikf/gallery-dl/issues/1070)) +- [mangakakalot] ignore "Go Home" buttons in chapter pages +- [newgrounds] handle embeds without scheme ([#1033](https://github.com/mikf/gallery-dl/issues/1033)) +- [newgrounds] provide fallback URLs for video downloads ([#1042](https://github.com/mikf/gallery-dl/issues/1042)) +- [xhamster] fix user profile extraction + ## 1.15.1 - 2020-10-11 ### Additions - [hentaicafe] add `manga_id` metadata field ([#1036](https://github.com/mikf/gallery-dl/issues/1036)) @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery_dl -Version: 1.15.1 +Version: 1.15.2 Summary: Command-line program to download image-galleries and -collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Author: Mike Fährmann @@ -94,8 +94,8 @@ Description: ========== put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__, and run it inside a command prompt (like ``cmd.exe``). - - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.15.1/gallery-dl.exe>`__ - - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.15.1/gallery-dl.bin>`__ + - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.15.2/gallery-dl.exe>`__ + - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.15.2/gallery-dl.bin>`__ These executables include a Python 3.8 interpreter and all required Python packages. @@ -225,8 +225,8 @@ Description: ========== ``pixiv``, ``nijie``, and ``seiga`` and optional for ``aryion``, ``danbooru``, ``e621``, ``exhentai``, ``idolcomplex``, ``inkbunny``, - ``instagram``, ``luscious``, ``sankaku``, ``subscribestar``, ``tsumino``, - and ``twitter``. + ``instagram``, ``luscious``, ``pinterest``, ``sankaku``, ``subscribestar``, + ``tsumino``, and ``twitter``. You can set the necessary information in your configuration file (cf. gallery-dl.conf_) @@ -319,7 +319,7 @@ Description: ========== .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst - .. _stable: https://github.com/mikf/gallery-dl/archive/v1.15.1.tar.gz + .. _stable: https://github.com/mikf/gallery-dl/archive/v1.15.2.tar.gz .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz .. _Python: https://www.python.org/downloads/ @@ -83,8 +83,8 @@ Download a standalone executable file, put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__, and run it inside a command prompt (like ``cmd.exe``). -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.15.1/gallery-dl.exe>`__ -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.15.1/gallery-dl.bin>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.15.2/gallery-dl.exe>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.15.2/gallery-dl.bin>`__ These executables include a Python 3.8 interpreter and all required Python packages. @@ -214,8 +214,8 @@ a username & password pair. This is necessary for ``pixiv``, ``nijie``, and ``seiga`` and optional for ``aryion``, ``danbooru``, ``e621``, ``exhentai``, ``idolcomplex``, ``inkbunny``, -``instagram``, ``luscious``, ``sankaku``, ``subscribestar``, ``tsumino``, -and ``twitter``. +``instagram``, ``luscious``, ``pinterest``, ``sankaku``, ``subscribestar``, +``tsumino``, and ``twitter``. You can set the necessary information in your configuration file (cf. gallery-dl.conf_) @@ -308,7 +308,7 @@ access to *gallery-dl*. Authorize it and you will be shown one or more .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst -.. _stable: https://github.com/mikf/gallery-dl/archive/v1.15.1.tar.gz +.. _stable: https://github.com/mikf/gallery-dl/archive/v1.15.2.tar.gz .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz .. _Python: https://www.python.org/downloads/ diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1 index cbcf4bf..9df67f4 100644 --- a/data/man/gallery-dl.1 +++ b/data/man/gallery-dl.1 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL" "1" "2020-10-11" "1.15.1" "gallery-dl Manual" +.TH "GALLERY-DL" "1" "2020-10-24" "1.15.2" "gallery-dl Manual" .\" disable hyphenation .nh diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5 index aeecaa0..8dd3187 100644 --- a/data/man/gallery-dl.conf.5 +++ b/data/man/gallery-dl.conf.5 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL.CONF" "5" "2020-10-11" "1.15.1" "gallery-dl Manual" +.TH "GALLERY-DL.CONF" "5" "2020-10-24" "1.15.2" "gallery-dl Manual" .\" disable hyphenation .nh .\" disable justification (adjust text to left margin only) @@ -325,6 +325,8 @@ and optional for .br * \f[I]luscious\f[] .br +* \f[I]pinterest\f[] +.br * \f[I]sankaku\f[] .br * \f[I]subscribestar\f[] @@ -333,13 +335,12 @@ and optional for .br * \f[I]twitter\f[] -These values can also be set via the \f[I]-u/--username\f[] and -\f[I]-p/--password\f[] command-line options or by using a \f[I].netrc\f[] file. -(see Authentication_) +These values can also be specified via the +\f[I]-u/--username\f[] and \f[I]-p/--password\f[] command-line options or +by using a \f[I].netrc\f[] file. (see Authentication_) -Note: The password values for \f[I]danbooru\f[] and \f[I]e621\f[] should be -the API keys found in your user profile, not your actual account -password. +Note: The password value for \f[I]danbooru\f[] and \f[I]e621\f[] should be +the API key found in your user profile, not the actual account password. .SS extractor.*.netrc @@ -370,7 +371,7 @@ Source to read additional cookies from. Either as Example: -.. code:: +.. code:: json { "cookie-name": "cookie-value", @@ -414,10 +415,10 @@ See \f[I]Requests' proxy documentation\f[] for more details. Example: -.. code:: +.. code:: json { -"http": "http://10.10.1.10:3128", +"http" : "http://10.10.1.10:3128", "https": "http://10.10.1.10:1080", "http://10.20.1.128": "http://10.10.1.10:5323" } @@ -533,18 +534,43 @@ An alternative \f[I]format string\f[] to build archive IDs with. \f[I]list\f[] of \f[I]Postprocessor Configuration\f[] objects .IP "Example:" 4 -.. code:: +.. code:: json [ -{"name": "zip", "compression": "zip"}, -{"name": "exec", "command": ["/home/foobar/script", "{category}", "{image_id}"]} +{ +"name": "zip" , +"compression": "store" +}, +{ +"name": "exec", +"command": ["/home/foobar/script", "{category}", "{image_id}"] +} ] .IP "Description:" 4 -A list of \f[I]post-processors\f[] +A list of \f[I]post processors\f[] to be applied to each downloaded file in the specified order. +Unlike other options, a \f[I]postprocessors\f[] setting at a deeper level +.br +does not override any \f[I]postprocessors\f[] setting at a lower level. +Instead, all post processors from all applicable \f[I]postprocessors\f[] +.br +settings get combined into a single list. + +For example + +.br +* an \f[I]mtime\f[] post processor at \f[I]extractor.postprocessors\f[], +.br +* a \f[I]zip\f[] post processor at \f[I]extractor.pixiv.postprocessors\f[], +.br +* and using \f[I]--exec\f[] + +will run all three post processors - \f[I]mtime\f[], \f[I]zip\f[], \f[I]exec\f[] - +for each downloaded \f[I]pixiv\f[] file. + .SS extractor.*.retries .IP "Type:" 6 @@ -555,7 +581,7 @@ to be applied to each downloaded file in the specified order. .IP "Description:" 4 Maximum number of times a failed HTTP request is retried before -giving up or \f[I]-1\f[] for infinite retries. +giving up, or \f[I]-1\f[] for infinite retries. .SS extractor.*.timeout @@ -1596,6 +1622,17 @@ Possible types are \f[I]text\f[], \f[I]quote\f[], \f[I]link\f[], \f[I]answer\f[] You can use \f[I]"all"\f[] instead of listing all types separately. +.SS extractor.twitter.cards +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Fetch media from \f[I]Cards\f[]. + + .SS extractor.twitter.quoted .IP "Type:" 6 \f[I]bool\f[] @@ -1680,12 +1717,24 @@ Download video files. \f[I]null\f[] .IP "Description:" 4 -Your \f[I]API Key\f[] to use -your account's browsing settings and default filters when searching. +Your \f[I]Wallhaven API Key\f[], +to use your account's browsing settings and default filters when searching. See https://wallhaven.cc/help/api for more information. +.SS extractor.weasyl.api-key +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +\f[I]null\f[] + +.IP "Description:" 4 +Your \f[I]Weasyl API Key\f[], +to use your account's browsing settings and filters. + + .SS extractor.weibo.retweets .IP "Type:" 6 \f[I]bool\f[] @@ -1946,7 +1995,7 @@ cause unexpected results in combination with other options \f[I]object\f[] .IP "Example:" 4 -.. code:: +.. code:: json { "quiet": true, @@ -2075,13 +2124,13 @@ before outputting them as JSON. \f[I]object\f[] .IP "Default:" 9 -.. code:: +.. code:: json { -"Pictures" : ["jpg", "jpeg", "png", "gif", "bmp", "svg", "webp"], -"Video" : ["flv", "ogv", "avi", "mp4", "mpg", "mpeg", "3gp", "mkv", "webm", "vob", "wmv"], -"Music" : ["mp3", "aac", "flac", "ogg", "wma", "m4a", "wav"], -"Archives" : ["zip", "rar", "7z", "tar", "gz", "bz2"] +"Pictures": ["jpg", "jpeg", "png", "gif", "bmp", "svg", "webp"], +"Video" : ["flv", "ogv", "avi", "mp4", "mpg", "mpeg", "3gp", "mkv", "webm", "vob", "wmv"], +"Music" : ["mp3", "aac", "flac", "ogg", "wma", "m4a", "wav"], +"Archives": ["zip", "rar", "7z", "tar", "gz", "bz2"] } @@ -2489,11 +2538,11 @@ Submission Policy, and Terms of Service. application and put them in your configuration file as \f[I]"client-id"\f[] and \f[I]"client-secret"\f[] .br -* clear your \f[I]cache\f[] (\f[I]--clear-cache\f[]) to delete -the \f[I]access-token\f[] from the previous \f[I]client-id\f[] +* clear your \f[I]cache\f[] to delete any remaining +\f[I]access-token\f[] entries. (\f[I]gallery-dl --clear-cache\f[]) .br -* get a new \f[I]refresh-token\f[] -if necessary +* get a new \f[I]refresh-token\f[] for the +new \f[I]client-id\f[] (\f[I]gallery-dl oauth:deviantart\f[]) .SS extractor.flickr.api-key & .api-secret @@ -2636,19 +2685,19 @@ The path \f[I]C:\\path\\to\\file.ext\f[] has therefore to be written as \f[I]object\f[] .IP "Example:" 4 -.. code:: +.. code:: json { -"format": "{asctime} {name}: {message}", +"format" : "{asctime} {name}: {message}", "format-date": "%H:%M:%S", -"path": "~/log.txt", -"encoding": "ascii" +"path" : "~/log.txt", +"encoding" : "ascii" } -.. code:: +.. code:: json { -"level": "debug", +"level" : "debug", "format": { "debug" : "debug: {message}", "info" : "[{name}] {message}", @@ -2710,7 +2759,7 @@ use \f[I]"w"\f[] to truncate or \f[I]"a"\f[] to append .br * Default: \f[I]"utf-8"\f[] -Note: path, mode and encoding are only applied when configuring +Note: path, mode, and encoding are only applied when configuring logging output to a file. @@ -2719,17 +2768,17 @@ logging output to a file. \f[I]object\f[] .IP "Example:" 4 -.. code:: +.. code:: json { "name": "mtime" } -.. code:: +.. code:: json { -"name": "zip", +"name" : "zip", "compression": "store", -"extension": "cbz", -"whitelist": ["mangadex", "exhentai", "nhentai"] +"extension" : "cbz", +"whitelist" : ["mangadex", "exhentai", "nhentai"] } .IP "Description:" 4 diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO index 3207269..18f8d82 100644 --- a/gallery_dl.egg-info/PKG-INFO +++ b/gallery_dl.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery-dl -Version: 1.15.1 +Version: 1.15.2 Summary: Command-line program to download image-galleries and -collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Author: Mike Fährmann @@ -94,8 +94,8 @@ Description: ========== put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__, and run it inside a command prompt (like ``cmd.exe``). - - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.15.1/gallery-dl.exe>`__ - - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.15.1/gallery-dl.bin>`__ + - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.15.2/gallery-dl.exe>`__ + - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.15.2/gallery-dl.bin>`__ These executables include a Python 3.8 interpreter and all required Python packages. @@ -225,8 +225,8 @@ Description: ========== ``pixiv``, ``nijie``, and ``seiga`` and optional for ``aryion``, ``danbooru``, ``e621``, ``exhentai``, ``idolcomplex``, ``inkbunny``, - ``instagram``, ``luscious``, ``sankaku``, ``subscribestar``, ``tsumino``, - and ``twitter``. + ``instagram``, ``luscious``, ``pinterest``, ``sankaku``, ``subscribestar``, + ``tsumino``, and ``twitter``. You can set the necessary information in your configuration file (cf. gallery-dl.conf_) @@ -319,7 +319,7 @@ Description: ========== .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst - .. _stable: https://github.com/mikf/gallery-dl/archive/v1.15.1.tar.gz + .. _stable: https://github.com/mikf/gallery-dl/archive/v1.15.2.tar.gz .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz .. _Python: https://www.python.org/downloads/ diff --git a/gallery_dl/extractor/35photo.py b/gallery_dl/extractor/35photo.py index e33aa2d..edb9d46 100644 --- a/gallery_dl/extractor/35photo.py +++ b/gallery_dl/extractor/35photo.py @@ -101,7 +101,7 @@ class _35photoUserExtractor(_35photoExtractor): """Extractor for all images of a user on 35photo.pro""" subcategory = "user" pattern = (r"(?:https?://)?(?:[a-z]+\.)?35photo\.pro" - r"/(?!photo_|genre_|tags/|rating/)([^/?&#]+)") + r"/(?!photo_|genre_|tags/|rating/)([^/?#]+)") test = ( ("https://35photo.pro/liya", { "pattern": r"https://m\d+.35photo.pro/photos_(main|series)/.*.jpg", @@ -142,7 +142,7 @@ class _35photoTagExtractor(_35photoExtractor): subcategory = "tag" directory_fmt = ("{category}", "Tags", "{search_tag}") archive_fmt = "t{search_tag}_{id}_{num}" - pattern = r"(?:https?://)?(?:[a-z]+\.)?35photo\.pro/tags/([^/?&#]+)" + pattern = r"(?:https?://)?(?:[a-z]+\.)?35photo\.pro/tags/([^/?#]+)" test = ("https://35photo.pro/tags/landscape/", { "range": "1-25", "count": 25, diff --git a/gallery_dl/extractor/4chan.py b/gallery_dl/extractor/4chan.py index 980dc20..bed30b1 100644 --- a/gallery_dl/extractor/4chan.py +++ b/gallery_dl/extractor/4chan.py @@ -65,7 +65,7 @@ class _4chanBoardExtractor(Extractor): """Extractor for 4chan boards""" category = "4chan" subcategory = "board" - pattern = r"(?:https?://)?boards\.4chan(?:nel)?\.org/([^/?&#]+)/\d*$" + pattern = r"(?:https?://)?boards\.4chan(?:nel)?\.org/([^/?#]+)/\d*$" test = ("https://boards.4channel.org/po/", { "pattern": _4chanThreadExtractor.pattern, "count": ">= 100", diff --git a/gallery_dl/extractor/500px.py b/gallery_dl/extractor/500px.py index fd973c3..624b14d 100644 --- a/gallery_dl/extractor/500px.py +++ b/gallery_dl/extractor/500px.py @@ -90,7 +90,7 @@ class _500pxExtractor(Extractor): class _500pxUserExtractor(_500pxExtractor): """Extractor for photos from a user's photostream on 500px.com""" subcategory = "user" - pattern = BASE_PATTERN + r"/(?!photo/)(?:p/)?([^/?&#]+)/?(?:$|\?|#)" + pattern = BASE_PATTERN + r"/(?!photo/)(?:p/)?([^/?#]+)/?(?:$|[?#])" test = ( ("https://500px.com/p/light_expression_photography", { "pattern": r"https?://drscdn.500px.org/photo/\d+/m%3D4096/v2", @@ -132,7 +132,7 @@ class _500pxGalleryExtractor(_500pxExtractor): subcategory = "gallery" directory_fmt = ("{category}", "{user[username]}", "{gallery[name]}") pattern = (BASE_PATTERN + r"/(?!photo/)(?:p/)?" - r"([^/?&#]+)/galleries/([^/?&#]+)") + r"([^/?#]+)/galleries/([^/?#]+)") test = ( ("https://500px.com/p/fashvamp/galleries/lera", { "url": "002dc81dee5b4a655f0e31ad8349e8903b296df6", diff --git a/gallery_dl/extractor/8kun.py b/gallery_dl/extractor/8kun.py index 7162920..47fe672 100644 --- a/gallery_dl/extractor/8kun.py +++ b/gallery_dl/extractor/8kun.py @@ -64,7 +64,7 @@ class _8kunBoardExtractor(Extractor): """Extractor for 8kun boards""" category = "8kun" subcategory = "board" - pattern = r"(?:https?://)?8kun\.top/([^/?&#]+)/(?:index|\d+)\.html" + pattern = r"(?:https?://)?8kun\.top/([^/?#]+)/(?:index|\d+)\.html" test = ( ("https://8kun.top/v/index.html", { "pattern": _8kunThreadExtractor.pattern, diff --git a/gallery_dl/extractor/8muses.py b/gallery_dl/extractor/8muses.py index b248735..3eb5565 100644 --- a/gallery_dl/extractor/8muses.py +++ b/gallery_dl/extractor/8muses.py @@ -22,7 +22,7 @@ class _8musesAlbumExtractor(Extractor): archive_fmt = "{hash}" root = "https://comics.8muses.com" pattern = (r"(?:https?://)?(?:comics\.|www\.)?8muses\.com" - r"(/comics/album/[^?&#]+)(\?[^#]+)?") + r"(/comics/album/[^?#]+)(\?[^#]+)?") test = ( ("https://comics.8muses.com/comics/album/Fakku-Comics/mogg/Liar", { "url": "6286ac33087c236c5a7e51f8a9d4e4d5548212d4", diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py index 985ad48..6914f24 100644 --- a/gallery_dl/extractor/artstation.py +++ b/gallery_dl/extractor/artstation.py @@ -131,7 +131,7 @@ class ArtstationUserExtractor(ArtstationExtractor): """Extractor for all projects of an artstation user""" subcategory = "user" pattern = (r"(?:https?://)?(?:(?:www\.)?artstation\.com" - r"/(?!artwork|projects|search)([^/?&#]+)(?:/albums/all)?" + r"/(?!artwork|projects|search)([^/?#]+)(?:/albums/all)?" r"|((?!www)\w+)\.artstation\.com(?:/projects)?)/?$") test = ( ("https://www.artstation.com/gaerikim/", { @@ -156,7 +156,7 @@ class ArtstationAlbumExtractor(ArtstationExtractor): "{album[id]} - {album[title]}") archive_fmt = "a_{album[id]}_{asset[id]}" pattern = (r"(?:https?://)?(?:(?:www\.)?artstation\.com" - r"/(?!artwork|projects|search)([^/?&#]+)" + r"/(?!artwork|projects|search)([^/?#]+)" r"|((?!www)\w+)\.artstation\.com)/albums/(\d+)") test = ( ("https://www.artstation.com/huimeiye/albums/770899", { @@ -199,7 +199,7 @@ class ArtstationLikesExtractor(ArtstationExtractor): directory_fmt = ("{category}", "{userinfo[username]}", "Likes") archive_fmt = "f_{userinfo[id]}_{asset[id]}" pattern = (r"(?:https?://)?(?:www\.)?artstation\.com" - r"/(?!artwork|projects|search)([^/?&#]+)/likes/?") + r"/(?!artwork|projects|search)([^/?#]+)/likes/?") test = ( ("https://www.artstation.com/mikf/likes", { "pattern": r"https://\w+\.artstation\.com/p/assets" @@ -225,7 +225,7 @@ class ArtstationChallengeExtractor(ArtstationExtractor): "{challenge[id]} - {challenge[title]}") archive_fmt = "c_{challenge[id]}_{asset_id}" pattern = (r"(?:https?://)?(?:www\.)?artstation\.com" - r"/contests/[^/?&#]+/challenges/(\d+)" + r"/contests/[^/?#]+/challenges/(\d+)" r"/?(?:\?sorting=([a-z]+))?") test = ( ("https://www.artstation.com/contests/thu-2017/challenges/20"), @@ -386,7 +386,7 @@ class ArtstationFollowingExtractor(ArtstationExtractor): """Extractor for a user's followed users""" subcategory = "following" pattern = (r"(?:https?://)?(?:www\.)?artstation\.com" - r"/(?!artwork|projects|search)([^/?&#]+)/following") + r"/(?!artwork|projects|search)([^/?#]+)/following") test = ("https://www.artstation.com/gaerikim/following", { "pattern": ArtstationUserExtractor.pattern, "count": ">= 50", diff --git a/gallery_dl/extractor/aryion.py b/gallery_dl/extractor/aryion.py index 374a9fc..6a90b76 100644 --- a/gallery_dl/extractor/aryion.py +++ b/gallery_dl/extractor/aryion.py @@ -153,7 +153,7 @@ class AryionGalleryExtractor(AryionExtractor): """Extractor for a user's gallery on eka's portal""" subcategory = "gallery" categorytransfer = True - pattern = BASE_PATTERN + r"/(?:gallery/|user/|latest.php\?name=)([^/?&#]+)" + pattern = BASE_PATTERN + r"/(?:gallery/|user/|latest.php\?name=)([^/?#]+)" test = ( ("https://aryion.com/g4/gallery/jameshoward", { "options": (("recursive", False),), diff --git a/gallery_dl/extractor/bcy.py b/gallery_dl/extractor/bcy.py index c3049a4..ec7020a 100644 --- a/gallery_dl/extractor/bcy.py +++ b/gallery_dl/extractor/bcy.py @@ -108,7 +108,7 @@ class BcyUserExtractor(BcyExtractor): test = ( ("https://bcy.net/u/1933712", { "pattern": r"https://img-bcy-qn.pstatp.com/\w+/\d+/post/\w+/.+jpg", - "count": ">= 25", + "count": ">= 20", }), ("https://bcy.net/u/109282764041", { "pattern": r"https://p\d-bcy.byteimg.com/img/banciyuan/[0-9a-f]+" diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py index be498bc..a817174 100644 --- a/gallery_dl/extractor/behance.py +++ b/gallery_dl/extractor/behance.py @@ -151,7 +151,7 @@ class BehanceUserExtractor(BehanceExtractor): """Extractor for a user's galleries from www.behance.net""" subcategory = "user" categorytransfer = True - pattern = r"(?:https?://)?(?:www\.)?behance\.net/([^/?&#]+)/?$" + pattern = r"(?:https?://)?(?:www\.)?behance\.net/([^/?#]+)/?$" test = ("https://www.behance.net/alexstrohl", { "count": ">= 8", "pattern": BehanceGalleryExtractor.pattern, diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py index 9c18e0e..60170dc 100644 --- a/gallery_dl/extractor/blogger.py +++ b/gallery_dl/extractor/blogger.py @@ -42,7 +42,7 @@ class BloggerExtractor(Extractor): blog["date"] = text.parse_datetime(blog["published"]) del blog["selfLink"] - sub = re.compile(r"/s\d+/").sub + sub = re.compile(r"/(?:s\d+|w\d+-h\d+)/").sub findall_image = re.compile( r'src="(https?://\d+\.bp\.blogspot\.com/[^"]+)').findall findall_video = re.compile( @@ -92,7 +92,7 @@ class BloggerExtractor(Extractor): class BloggerPostExtractor(BloggerExtractor): """Extractor for a single blog post""" subcategory = "post" - pattern = BASE_PATTERN + r"(/\d{4}/\d\d/[^/?&#]+\.html)" + pattern = BASE_PATTERN + r"(/\d{4}/\d\d/[^/?#]+\.html)" test = ( ("https://julianbphotography.blogspot.com/2010/12/moon-rise.html", { "url": "9928429fb62f712eb4de80f53625eccecc614aae", @@ -134,6 +134,10 @@ class BloggerPostExtractor(BloggerExtractor): "cfnm-scene-jenna-fischer-in-office.html"), { "pattern": r"https://.+\.googlevideo\.com/videoplayback", }), + # image URLs with width/height (#1061) + ("https://aaaninja.blogspot.com/2020/08/altera-boob-press-2.html", { + "pattern": r"https://1.bp.blogspot.com/.+/s0/altera_.+png", + }), ) def __init__(self, match): @@ -167,7 +171,7 @@ class BloggerBlogExtractor(BloggerExtractor): class BloggerSearchExtractor(BloggerExtractor): """Extractor for search resuls and labels""" subcategory = "search" - pattern = BASE_PATTERN + r"/search(?:/?\?q=([^/?&#]+)|/label/([^/?&#]+))" + pattern = BASE_PATTERN + r"/search(?:/?\?q=([^/?#]+)|/label/([^/?#]+))" test = ( ("https://julianbphotography.blogspot.com/search?q=400mm", { "count": "< 10" diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py index be0027a..0176d76 100644 --- a/gallery_dl/extractor/booru.py +++ b/gallery_dl/extractor/booru.py @@ -51,7 +51,7 @@ class BooruExtractor(SharedConfigMixin, Extractor): for image in images: try: - url = image["file_url"] + url = self.get_file_url(image) except KeyError: continue if url.startswith("/"): @@ -86,6 +86,10 @@ class BooruExtractor(SharedConfigMixin, Extractor): """Collect metadata for extractor-job""" return {} + @staticmethod + def get_file_url(image): + return image["file_url"] + def extended_tags(self, image, page=None): """Retrieve extended tag information""" if not page: diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 9cceaee..e40ec51 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -460,7 +460,7 @@ class DeviantartFolderExtractor(DeviantartExtractor): subcategory = "folder" directory_fmt = ("{category}", "{username}", "{folder[title]}") archive_fmt = "F_{folder[uuid]}_{index}.{extension}" - pattern = BASE_PATTERN + r"/gallery/(\d+)/([^/?&#]+)" + pattern = BASE_PATTERN + r"/gallery/(\d+)/([^/?#]+)" test = ( # user ("https://www.deviantart.com/shimoda7/gallery/722019/Miscellaneous", { @@ -601,7 +601,7 @@ class DeviantartCollectionExtractor(DeviantartExtractor): directory_fmt = ("{category}", "{username}", "Favourites", "{collection[title]}") archive_fmt = "C_{collection[uuid]}_{index}.{extension}" - pattern = BASE_PATTERN + r"/favourites/(\d+)/([^/?&#]+)" + pattern = BASE_PATTERN + r"/favourites/(\d+)/([^/?#]+)" test = ( (("https://www.deviantart.com/pencilshadings" "/favourites/70595441/3D-Favorites"), { @@ -671,8 +671,8 @@ class DeviantartPopularExtractor(DeviantartExtractor): archive_fmt = "P_{popular[range]}_{popular[search]}_{index}.{extension}" pattern = (r"(?:https?://)?www\.deviantart\.com/(?:" r"search(?:/deviations)?" - r"|(?:deviations/?)?\?order=(popular-[^/?&#]+)" - r"|((?:[\w-]+/)*)(popular-[^/?&#]+)" + r"|(?:deviations/?)?\?order=(popular-[^/?#]+)" + r"|((?:[\w-]+/)*)(popular-[^/?#]+)" r")/?(?:\?([^#]*))?") test = ( ("https://www.deviantart.com/?order=popular-all-time", { @@ -730,7 +730,7 @@ class DeviantartDeviationExtractor(DeviantartExtractor): """Extractor for single deviations""" subcategory = "deviation" archive_fmt = "{index}.{extension}" - pattern = BASE_PATTERN + r"/(art|journal)/(?:[^/?&#]+-)?(\d+)" + pattern = BASE_PATTERN + r"/(art|journal)/(?:[^/?#]+-)?(\d+)" test = ( (("https://www.deviantart.com/shimoda7/art/For-the-sake-10073852"), { "options": (("original", 0),), diff --git a/gallery_dl/extractor/directlink.py b/gallery_dl/extractor/directlink.py index 1d17658..a6346bf 100644 --- a/gallery_dl/extractor/directlink.py +++ b/gallery_dl/extractor/directlink.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2019 Mike Fährmann +# Copyright 2017-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -17,7 +17,7 @@ class DirectlinkExtractor(Extractor): category = "directlink" filename_fmt = "{domain}/{path}/{filename}.{extension}" archive_fmt = filename_fmt - pattern = (r"(?i)https?://(?P<domain>[^/?&#]+)/(?P<path>[^?&#]+\." + pattern = (r"(?i)https?://(?P<domain>[^/?#]+)/(?P<path>[^?#]+\." r"(?:jpe?g|jpe|png|gif|web[mp]|mp4|mkv|og[gmv]|opus))" r"(?:\?(?P<query>[^/?#]*))?(?:#(?P<fragment>.*))?$") test = ( diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py index 9cc6738..7d26c47 100644 --- a/gallery_dl/extractor/dynastyscans.py +++ b/gallery_dl/extractor/dynastyscans.py @@ -44,7 +44,7 @@ class DynastyscansBase(): class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor): """Extractor for manga-chapters from dynasty-scans.com""" - pattern = BASE_PATTERN + r"(/chapters/[^/?&#]+)" + pattern = BASE_PATTERN + r"(/chapters/[^/?#]+)" test = ( (("http://dynasty-scans.com/chapters/" "hitoribocchi_no_oo_seikatsu_ch33"), { diff --git a/gallery_dl/extractor/fallenangels.py b/gallery_dl/extractor/fallenangels.py index 44863a9..ab0e0c5 100644 --- a/gallery_dl/extractor/fallenangels.py +++ b/gallery_dl/extractor/fallenangels.py @@ -17,7 +17,7 @@ class FallenangelsChapterExtractor(ChapterExtractor): """Extractor for manga-chapters from fascans.com""" category = "fallenangels" pattern = (r"(?:https?://)?(manga|truyen)\.fascans\.com" - r"/manga/([^/]+)/(\d+)(\.[^/?&#]+)?") + r"/manga/([^/?#]+)/([^/?#]+)") test = ( ("https://manga.fascans.com/manga/chronos-ruler/20/1", { "url": "4604a7914566cc2da0ff789aa178e2d1c8c241e3", @@ -28,12 +28,13 @@ class FallenangelsChapterExtractor(ChapterExtractor): "keyword": "2bdb7334c0e3eceb9946ffd3132df679b4a94f6a", }), ("http://manga.fascans.com/manga/rakudai-kishi-no-eiyuutan/19.5", { - "keyword": "9fcca4c1a90d11f00764f62477ebe10bd408021c", + "url": "273f6863966c83ea79ad5846a2866e08067d3f0e", + "keyword": "d1065685bfe0054c4ff2a0f20acb089de4cec253", }), ) def __init__(self, match): - self.version, self.manga, self.chapter, self.minor = match.groups() + self.version, self.manga, self.chapter = match.groups() url = "https://{}.fascans.com/manga/{}/{}/1".format( self.version, self.manga, self.chapter) ChapterExtractor.__init__(self, match, url) @@ -41,11 +42,12 @@ class FallenangelsChapterExtractor(ChapterExtractor): def metadata(self, page): extr = text.extract_from(page) lang = "vi" if self.version == "truyen" else "en" + chapter, sep, minor = self.chapter.partition(".") return { "manga" : extr('name="description" content="', ' Chapter '), "title" : extr(': ', ' - Page 1'), - "chapter" : self.chapter, - "chapter_minor": self.minor or "", + "chapter" : chapter, + "chapter_minor": sep + minor, "lang" : lang, "language": util.code_to_language(lang), } diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py index bf925b6..4245617 100644 --- a/gallery_dl/extractor/foolslide.py +++ b/gallery_dl/extractor/foolslide.py @@ -45,7 +45,7 @@ class FoolslideChapterExtractor(FoolslideBase, ChapterExtractor): """Base class for chapter extractors for FoOlSlide based sites""" directory_fmt = ("{category}", "{manga}", "{chapter_string}") archive_fmt = "{id}" - pattern_fmt = r"(/read/[^/?&#]+/[a-z-]+/\d+/\d+(?:/\d+)?)" + pattern_fmt = r"(/read/[^/?#]+/[a-z-]+/\d+/\d+(?:/\d+)?)" decode = "default" def items(self): @@ -86,7 +86,7 @@ class FoolslideChapterExtractor(FoolslideBase, ChapterExtractor): class FoolslideMangaExtractor(FoolslideBase, MangaExtractor): """Base class for manga extractors for FoOlSlide based sites""" - pattern_fmt = r"(/series/[^/?&#]+)" + pattern_fmt = r"(/series/[^/?#]+)" def chapters(self, page): extr = text.extract_from(page) diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py index 2a5ef6e..752cd62 100644 --- a/gallery_dl/extractor/furaffinity.py +++ b/gallery_dl/extractor/furaffinity.py @@ -177,7 +177,7 @@ class FuraffinityExtractor(Extractor): class FuraffinityGalleryExtractor(FuraffinityExtractor): """Extractor for a furaffinity user's gallery""" subcategory = "gallery" - pattern = BASE_PATTERN + r"/gallery/([^/?&#]+)" + pattern = BASE_PATTERN + r"/gallery/([^/?#]+)" test = ("https://www.furaffinity.net/gallery/mirlinthloth/", { "pattern": r"https://d\d?.facdn.net/art/mirlinthloth/\d+/\d+.\w+\.\w+", "range": "45-50", @@ -189,7 +189,7 @@ class FuraffinityScrapsExtractor(FuraffinityExtractor): """Extractor for a furaffinity user's scraps""" subcategory = "scraps" directory_fmt = ("{category}", "{user!l}", "Scraps") - pattern = BASE_PATTERN + r"/scraps/([^/?&#]+)" + pattern = BASE_PATTERN + r"/scraps/([^/?#]+)" test = ("https://www.furaffinity.net/scraps/mirlinthloth/", { "pattern": r"https://d\d?.facdn.net/art/[^/]+(/stories)?/\d+/\d+.\w+.", "count": ">= 3", @@ -200,7 +200,7 @@ class FuraffinityFavoriteExtractor(FuraffinityExtractor): """Extractor for a furaffinity user's favorites""" subcategory = "favorite" directory_fmt = ("{category}", "{user!l}", "Favorites") - pattern = BASE_PATTERN + r"/favorites/([^/?&#]+)" + pattern = BASE_PATTERN + r"/favorites/([^/?#]+)" test = ("https://www.furaffinity.net/favorites/mirlinthloth/", { "pattern": r"https://d\d?.facdn.net/art/[^/]+/\d+/\d+.\w+\.\w+", "range": "45-50", @@ -278,7 +278,7 @@ class FuraffinityUserExtractor(FuraffinityExtractor): """Extractor for furaffinity user profiles""" subcategory = "user" cookiedomain = None - pattern = BASE_PATTERN + r"/user/([^/?&#]+)" + pattern = BASE_PATTERN + r"/user/([^/?#]+)" test = ( ("https://www.furaffinity.net/user/mirlinthloth/", { "pattern": r"/gallery/mirlinthloth/$", @@ -302,7 +302,7 @@ class FuraffinityUserExtractor(FuraffinityExtractor): class FuraffinityFollowingExtractor(FuraffinityExtractor): """Extractor for a furaffinity user's watched users""" subcategory = "following" - pattern = BASE_PATTERN + "/watchlist/by/([^/?&#]+)" + pattern = BASE_PATTERN + "/watchlist/by/([^/?#]+)" test = ("https://www.furaffinity.net/watchlist/by/mirlinthloth/", { "pattern": FuraffinityUserExtractor.pattern, "range": "176-225", diff --git a/gallery_dl/extractor/fuskator.py b/gallery_dl/extractor/fuskator.py index eba1c39..df55061 100644 --- a/gallery_dl/extractor/fuskator.py +++ b/gallery_dl/extractor/fuskator.py @@ -17,7 +17,7 @@ class FuskatorGalleryExtractor(GalleryExtractor): """Extractor for image galleries on fuskator.com""" category = "fuskator" root = "https://fuskator.com" - pattern = r"(?:https?://)?fuskator\.com/(?:thumbs|expanded)/([^/?&#]+)" + pattern = r"(?:https?://)?fuskator\.com/(?:thumbs|expanded)/([^/?#]+)" test = ( ("https://fuskator.com/thumbs/d0GnIzXrSKU/", { "pattern": r"https://i\d+.fuskator.com/large/d0GnIzXrSKU/.+\.jpg", diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index edadd31..c32ba5c 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -33,6 +33,15 @@ class GelbooruExtractor(booru.XmlParserMixin, self.session.cookies["fringeBenefits"] = "yup" self.per_page = 42 + @staticmethod + def get_file_url(image): + url = image["file_url"] + if url.startswith("https://mp4.gelbooru.com/"): + ihash = image["md5"] + return "https://img2.gelbooru.com/images/{}/{}/{}.webm".format( + ihash[0:2], ihash[2:4], ihash) + return url + def items_noapi(self): yield Message.Version, 1 data = self.get_metadata() diff --git a/gallery_dl/extractor/gfycat.py b/gallery_dl/extractor/gfycat.py index ba2fe5d..493c1d2 100644 --- a/gallery_dl/extractor/gfycat.py +++ b/gallery_dl/extractor/gfycat.py @@ -56,7 +56,7 @@ class GfycatUserExtractor(GfycatExtractor): """Extractor for gfycat user profiles""" subcategory = "user" directory_fmt = ("{category}", "{userName}") - pattern = r"(?:https?://)?gfycat\.com/@([^/?&#]+)" + pattern = r"(?:https?://)?gfycat\.com/@([^/?#]+)" test = ("https://gfycat.com/@gretta", { "pattern": r"https://giant\.gfycat\.com/[A-Za-z]+\.mp4", "count": ">= 100", @@ -70,7 +70,7 @@ class GfycatSearchExtractor(GfycatExtractor): """Extractor for gfycat search results""" subcategory = "search" directory_fmt = ("{category}", "Search", "{search}") - pattern = r"(?:https?://)?gfycat\.com/gifs/search/([^/?&#]+)" + pattern = r"(?:https?://)?gfycat\.com/gifs/search/([^/?#]+)" test = ("https://gfycat.com/gifs/search/funny+animals", { "pattern": r"https://\w+\.gfycat\.com/[A-Za-z]+\.mp4", "archive": False, diff --git a/gallery_dl/extractor/hentai2read.py b/gallery_dl/extractor/hentai2read.py index 354acbf..53be67b 100644 --- a/gallery_dl/extractor/hentai2read.py +++ b/gallery_dl/extractor/hentai2read.py @@ -23,7 +23,7 @@ class Hentai2readBase(): class Hentai2readChapterExtractor(Hentai2readBase, ChapterExtractor): """Extractor for a single manga chapter from hentai2read.com""" archive_fmt = "{chapter_id}_{page}" - pattern = r"(?:https?://)?(?:www\.)?hentai2read\.com(/[^/?&#]+/(\d+))" + pattern = r"(?:https?://)?(?:www\.)?hentai2read\.com(/[^/?#]+/(\d+))" test = ("https://hentai2read.com/amazon_elixir/1/", { "url": "964b942cf492b3a129d2fe2608abfc475bc99e71", "keyword": "ff84b8f751f0e4ee37717efc4332ff1db71951d9", @@ -63,7 +63,7 @@ class Hentai2readChapterExtractor(Hentai2readBase, ChapterExtractor): class Hentai2readMangaExtractor(Hentai2readBase, MangaExtractor): """Extractor for hmanga from hentai2read.com""" chapterclass = Hentai2readChapterExtractor - pattern = r"(?:https?://)?(?:www\.)?hentai2read\.com(/[^/?&#]+)/?$" + pattern = r"(?:https?://)?(?:www\.)?hentai2read\.com(/[^/?#]+)/?$" test = ( ("https://hentai2read.com/amazon_elixir/", { "url": "273073752d418ec887d7f7211e42b832e8c403ba", diff --git a/gallery_dl/extractor/hentaicafe.py b/gallery_dl/extractor/hentaicafe.py index 833135e..e12670a 100644 --- a/gallery_dl/extractor/hentaicafe.py +++ b/gallery_dl/extractor/hentaicafe.py @@ -20,7 +20,7 @@ class HentaicafeChapterExtractor(foolslide.FoolslideChapterExtractor): category = "hentaicafe" directory_fmt = ("{category}", "{manga}") pattern = (r"(?:https?://)?(?:www\.)?hentai\.cafe" - r"(/manga/read/[^/?&#]+/[a-z-]+/\d+/\d+(?:/\d+)?)") + r"(/manga/read/[^/?#]+/[a-z-]+/\d+/\d+(?:/\d+)?)") test = ("https://hentai.cafe/manga/read/saitom-box/en/0/1/", { "url": "8c6a8c56875ba3ed7ab0a74a64f9960077767fc2", "keyword": "6913608267d883c82b887303b9ced13821188329", @@ -45,7 +45,7 @@ class HentaicafeMangaExtractor(foolslide.FoolslideMangaExtractor): """Extractor for manga from hentai.cafe""" category = "hentaicafe" pattern = (r"(?:https?://)?" + r"(?:www\.)?hentai\.cafe" - r"(/hc\.fyi/\d+|(?:/manga/series)?/[^/?&#]+)/?$") + r"(/hc\.fyi/\d+|(?:/manga/series)?/[^/?#]+)/?$") test = ( # single chapter ("https://hentai.cafe/hazuki-yuuto-summer-blues/", { diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py index 5eb46b6..0be528d 100644 --- a/gallery_dl/extractor/hentaifoundry.py +++ b/gallery_dl/extractor/hentaifoundry.py @@ -9,7 +9,9 @@ """Extractors for https://www.hentai-foundry.com/""" from .common import Extractor, Message -from .. import text, util, exception +from .. import text, util + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?hentai-foundry\.com" class HentaifoundryExtractor(Extractor): @@ -21,22 +23,21 @@ class HentaifoundryExtractor(Extractor): root = "https://www.hentai-foundry.com" per_page = 25 - def __init__(self, match, user="", page=1): + def __init__(self, match): Extractor.__init__(self, match) + self.user = match.group(1) self.page_url = "" - self.user = user self.start_post = 0 - self.start_page = text.parse_int(page, 1) + self.start_page = 1 def items(self): - data = self.get_job_metadata() - yield Message.Version, 1 - yield Message.Directory, data + self._init_site_filters() + data = self.metadata() - self.set_filters() - for page_url in util.advance(self._pagination(), self.start_post): - image = self.get_image_metadata(page_url) + for post_url in util.advance(self.posts(), self.start_post): + image = self._parse_post(post_url) image.update(data) + yield Message.Directory, image yield Message.Url, image["src"], image def skip(self, num): @@ -45,24 +46,25 @@ class HentaifoundryExtractor(Extractor): self.start_post += posts return num - def get_job_metadata(self): - """Collect metadata for extractor-job""" - self.request(self.root + "/?enterAgree=1") + def metadata(self): return {"user": self.user} - def _pagination(self, begin='thumbTitle"><a href="', end='"'): + def posts(self): + return self._pagination(self.page_url) + + def _pagination(self, url, begin='thumbTitle"><a href="', end='"'): num = self.start_page while True: - page = self.request("{}/page/{}".format(self.page_url, num)).text + page = self.request("{}/page/{}".format(url, num)).text yield from text.extract_iter(page, begin, end) if 'class="pager"' not in page or 'class="last hidden"' in page: return num += 1 - def get_image_metadata(self, path): - """Collect url and metadata from an image page""" + def _parse_post(self, path): + """Collect url and metadata from an image post""" url = text.urljoin(self.root, path) page = self.request(url).text extr = text.extract_from(page, page.index('id="picBox"')) @@ -89,7 +91,7 @@ class HentaifoundryExtractor(Extractor): return text.nameext_from_url(data["src"], data) - def get_story_metadata(self, html): + def _parse_story(self, html): """Collect url and metadata for a story""" extr = text.extract_from(html) data = { @@ -116,68 +118,66 @@ class HentaifoundryExtractor(Extractor): return text.nameext_from_url(data["src"], data) - def set_filters(self): + def _init_site_filters(self): """Set site-internal filters to show all images""" - token = text.unquote(text.extract( - self.session.cookies["YII_CSRF_TOKEN"], "%22", "%22")[0]) + url = self.root + "/?enterAgree=1" + response = self.request(url, method="HEAD") + + url = self.root + "/site/filters" data = { - "YII_CSRF_TOKEN": token, - "rating_nudity": 3, - "rating_violence": 3, - "rating_profanity": 3, - "rating_racism": 3, - "rating_sex": 3, - "rating_spoilers": 3, - "rating_yaoi": 1, - "rating_yuri": 1, - "rating_teen": 1, - "rating_guro": 1, - "rating_furry": 1, - "rating_beast": 1, - "rating_male": 1, - "rating_female": 1, - "rating_futa": 1, - "rating_other": 1, - "rating_scat": 1, - "rating_incest": 1, - "rating_rape": 1, - "filter_media": "A", - "filter_order": "date_new", - "filter_type": 0, + "rating_nudity" : "3", + "rating_violence" : "3", + "rating_profanity": "3", + "rating_racism" : "3", + "rating_sex" : "3", + "rating_spoilers" : "3", + "rating_yaoi" : "1", + "rating_yuri" : "1", + "rating_teen" : "1", + "rating_guro" : "1", + "rating_furry" : "1", + "rating_beast" : "1", + "rating_male" : "1", + "rating_female" : "1", + "rating_futa" : "1", + "rating_other" : "1", + "rating_scat" : "1", + "rating_incest" : "1", + "rating_rape" : "1", + "filter_media" : "A", + "filter_order" : "date_new", + "filter_type" : "0", + "YII_CSRF_TOKEN" : text.unquote(text.extract( + response.cookies["YII_CSRF_TOKEN"], "%22", "%22")[0]), } - url = self.root + "/site/filters" self.request(url, method="POST", data=data) class HentaifoundryUserExtractor(HentaifoundryExtractor): - """Extractor for all images of a hentai-foundry-user""" + """Extractor for a hentaifoundry user profile""" subcategory = "user" - pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com" - r"/user/([^/]+)/profile") + pattern = BASE_PATTERN + r"/user/([^/?#]+)/profile" test = ("https://www.hentai-foundry.com/user/Tenpura/profile",) - def __init__(self, match): - HentaifoundryExtractor.__init__(self, match, match.group(1)) - def items(self): + root = self.root user = "/user/" + self.user return self._dispatch_extractors(( (HentaifoundryPicturesExtractor , - self.root + "/pictures" + user), + root + "/pictures" + user), (HentaifoundryScrapsExtractor, - self.root + "/pictures" + user + "/scraps"), + root + "/pictures" + user + "/scraps"), (HentaifoundryStoriesExtractor, - self.root + "/stories" + user), + root + "/stories" + user), (HentaifoundryFavoriteExtractor, - self.root + user + "/faves/pictures"), + root + user + "/faves/pictures"), ), ("pictures",)) class HentaifoundryPicturesExtractor(HentaifoundryExtractor): """Extractor for all pictures of a hentaifoundry user""" subcategory = "pictures" - pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com" - r"/pictures/user/([^/]+)(?:/page/(\d+))?/?$") + pattern = BASE_PATTERN + r"/pictures/user/([^/?#]+)(?:/page/(\d+))?/?$" test = ( ("https://www.hentai-foundry.com/pictures/user/Tenpura", { "url": "ebbc981a85073745e3ca64a0f2ab31fab967fc28", @@ -186,22 +186,15 @@ class HentaifoundryPicturesExtractor(HentaifoundryExtractor): ) def __init__(self, match): - HentaifoundryExtractor.__init__( - self, match, match.group(1), match.group(2)) + HentaifoundryExtractor.__init__(self, match) self.page_url = "{}/pictures/user/{}".format(self.root, self.user) - def get_job_metadata(self): - page = self.request(self.page_url + "?enterAgree=1").text - count = text.extract(page, ">Pictures (", ")")[0] - return {"user": self.user, "count": text.parse_int(count)} - class HentaifoundryScrapsExtractor(HentaifoundryExtractor): - """Extractor for scrap images of a hentai-foundry-user""" + """Extractor for scraps of a hentaifoundry user""" subcategory = "scraps" directory_fmt = ("{category}", "{user}", "Scraps") - pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com" - r"/pictures/user/([^/]+)/scraps(?:/page/(\d+))?") + pattern = BASE_PATTERN + r"/pictures/user/([^/?#]+)/scraps" test = ( ("https://www.hentai-foundry.com/pictures/user/Evulchibi/scraps", { "url": "7cd9c6ec6258c4ab8c44991f7731be82337492a7", @@ -211,24 +204,17 @@ class HentaifoundryScrapsExtractor(HentaifoundryExtractor): ) def __init__(self, match): - HentaifoundryExtractor.__init__( - self, match, match.group(1), match.group(2)) + HentaifoundryExtractor.__init__(self, match) self.page_url = "{}/pictures/user/{}/scraps".format( self.root, self.user) - def get_job_metadata(self): - page = self.request(self.page_url + "?enterAgree=1").text - count = text.extract(page, ">Scraps (", ")")[0] - return {"user": self.user, "count": text.parse_int(count)} - class HentaifoundryFavoriteExtractor(HentaifoundryExtractor): - """Extractor for favorite images of a hentai-foundry-user""" + """Extractor for favorite images of a hentaifoundry user""" subcategory = "favorite" directory_fmt = ("{category}", "{user}", "Favorites") archive_fmt = "f_{user}_{index}" - pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com" - r"/user/([^/]+)/faves/pictures(?:/page/(\d+))?") + pattern = BASE_PATTERN + r"/user/([^/?#]+)/faves/pictures" test = ( ("https://www.hentai-foundry.com/user/Tenpura/faves/pictures", { "url": "56f9ae2e89fe855e9fe1da9b81e5ec6212b0320b", @@ -238,8 +224,7 @@ class HentaifoundryFavoriteExtractor(HentaifoundryExtractor): ) def __init__(self, match): - HentaifoundryExtractor.__init__( - self, match, match.group(1), match.group(2)) + HentaifoundryExtractor.__init__(self, match) self.page_url = "{}/user/{}/faves/pictures".format( self.root, self.user) @@ -249,21 +234,18 @@ class HentaifoundryRecentExtractor(HentaifoundryExtractor): subcategory = "recent" directory_fmt = ("{category}", "Recent Pictures", "{date}") archive_fmt = "r_{index}" - pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com" - r"/pictures/recent/(\d+-\d+-\d+)(?:/page/(\d+))?") + pattern = BASE_PATTERN + r"/pictures/recent/(\d\d\d\d-\d\d-\d\d)" test = ("http://www.hentai-foundry.com/pictures/recent/2018-09-20", { - "pattern": r"https://pictures.hentai-foundry.com/[^/]/[^/]+/\d+/", + "pattern": r"https://pictures.hentai-foundry.com/[^/]/[^/?#]+/\d+/", "range": "20-30", }) def __init__(self, match): - HentaifoundryExtractor.__init__(self, match, "", match.group(2)) - self.date = match.group(1) - self.page_url = "{}/pictures/recent/{}".format(self.root, self.date) + HentaifoundryExtractor.__init__(self, match) + self.page_url = "{}/pictures/recent/{}".format(self.root, self.user) - def get_job_metadata(self): - self.request(self.root + "/?enterAgree=1") - return {"date": self.date} + def metadata(self): + return {"date": self.user} class HentaifoundryPopularExtractor(HentaifoundryExtractor): @@ -271,15 +253,14 @@ class HentaifoundryPopularExtractor(HentaifoundryExtractor): subcategory = "popular" directory_fmt = ("{category}", "Popular Pictures") archive_fmt = "p_{index}" - pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com" - r"/pictures/popular(?:/page/(\d+))?") + pattern = BASE_PATTERN + r"/pictures/popular()" test = ("http://www.hentai-foundry.com/pictures/popular", { - "pattern": r"https://pictures.hentai-foundry.com/[^/]/[^/]+/\d+/", + "pattern": r"https://pictures.hentai-foundry.com/[^/]/[^/?#]+/\d+/", "range": "20-30", }) def __init__(self, match): - HentaifoundryExtractor.__init__(self, match, "", match.group(1)) + HentaifoundryExtractor.__init__(self, match) self.page_url = self.root + "/pictures/popular" @@ -287,7 +268,7 @@ class HentaifoundryImageExtractor(HentaifoundryExtractor): """Extractor for a single image from hentaifoundry.com""" subcategory = "image" pattern = (r"(?:https?://)?(?:www\.|pictures\.)?hentai-foundry\.com" - r"/(?:pictures/user|[^/])/([^/]+)/(\d+)") + r"/(?:pictures/user|[^/?#])/([^/?#]+)/(\d+)") test = ( (("https://www.hentai-foundry.com" "/pictures/user/Tenpura/407501/shimakaze"), { @@ -309,36 +290,30 @@ class HentaifoundryImageExtractor(HentaifoundryExtractor): "width" : 495, }, }), - ("https://www.hentai-foundry.com/pictures/user/Tenpura/340853/", { - "exception": exception.HttpError, - }), + ("https://www.hentai-foundry.com/pictures/user/Tenpura/340853/"), ("https://pictures.hentai-foundry.com" "/t/Tenpura/407501/Tenpura-407501-shimakaze.png"), ) + skip = Extractor.skip def __init__(self, match): - HentaifoundryExtractor.__init__(self, match, match.group(1)) + HentaifoundryExtractor.__init__(self, match) self.index = match.group(2) def items(self): post_url = "{}/pictures/user/{}/{}/?enterAgree=1".format( self.root, self.user, self.index) - data = self.get_image_metadata(post_url) - data["user"] = self.user - - yield Message.Version, 1 - yield Message.Directory, data - yield Message.Url, data["src"], data - - def skip(self, _): - return 0 + image = self._parse_post(post_url) + image["user"] = self.user + yield Message.Directory, image + yield Message.Url, image["src"], image class HentaifoundryStoriesExtractor(HentaifoundryExtractor): - """Extractor for stories of a hentai-foundry user""" + """Extractor for stories of a hentaifoundry user""" subcategory = "stories" - pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com" - r"/stories/user/([^/]+)(?:/page/(\d+))?/?$") + archive_fmt = "s_{index}" + pattern = BASE_PATTERN + r"/stories/user/([^/?#]+)(?:/page/(\d+))?/?$" test = ("https://www.hentai-foundry.com/stories/user/SnowWolf35", { "count": ">= 35", "keyword": { @@ -358,42 +333,37 @@ class HentaifoundryStoriesExtractor(HentaifoundryExtractor): }, }) - def __init__(self, match): - HentaifoundryExtractor.__init__(self, match, match.group(1)) - self.page_url = "{}/stories/user/{}".format(self.root, self.user) - def items(self): - self.get_job_metadata() - self.set_filters() - stories = self._pagination('<div class="storyRow">', '</tr></table>') - for story_html in util.advance(stories, self.start_post): - story = self.get_story_metadata(story_html) + self._init_site_filters() + for story_html in util.advance(self.stories(), self.start_post): + story = self._parse_story(story_html) yield Message.Directory, story yield Message.Url, story["src"], story + def stories(self): + url = "{}/stories/user/{}".format(self.root, self.user) + return self._pagination(url, '<div class="storyRow">', '</tr></table>') + class HentaifoundryStoryExtractor(HentaifoundryExtractor): """Extractor for a hentaifoundry story""" subcategory = "story" - pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com" - r"/stories/user/([^/]+)/(\d+)") + archive_fmt = "s_{index}" + pattern = BASE_PATTERN + r"/stories/user/([^/?#]+)/(\d+)" test = (("https://www.hentai-foundry.com/stories/user/SnowWolf35" "/26416/Overwatch-High-Chapter-Voting-Location"), { "url": "5a67cfa8c3bf7634c8af8485dd07c1ea74ee0ae8", "keyword": {"title": "Overwatch High Chapter Voting Location"}, }) + skip = Extractor.skip def __init__(self, match): - HentaifoundryExtractor.__init__(self, match, match.group(1)) + HentaifoundryExtractor.__init__(self, match) self.index = match.group(2) def items(self): story_url = "{}/stories/user/{}/{}/x?enterAgree=1".format( self.root, self.user, self.index) - page = self.request(story_url).text - story = self.get_story_metadata(page) + story = self._parse_story(self.request(story_url).text) yield Message.Directory, story yield Message.Url, story["src"], story - - def skip(self, _): - return 0 diff --git a/gallery_dl/extractor/hentaihand.py b/gallery_dl/extractor/hentaihand.py index 7635bf1..4485925 100644 --- a/gallery_dl/extractor/hentaihand.py +++ b/gallery_dl/extractor/hentaihand.py @@ -74,7 +74,7 @@ class HentaihandTagExtractor(Extractor): root = "https://hentaihand.com" pattern = (r"(?i)(?:https?://)?(?:www\.)?hentaihand\.com" r"/\w+/(parody|character|tag|artist|group|language" - r"|category|relationship)/([^/?&#]+)") + r"|category|relationship)/([^/?#]+)") test = ( ("https://hentaihand.com/en/artist/himuro", { "pattern": HentaihandGalleryExtractor.pattern, diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py index 1c53723..93ef6f1 100644 --- a/gallery_dl/extractor/hiperdex.py +++ b/gallery_dl/extractor/hiperdex.py @@ -64,7 +64,7 @@ class HiperdexBase(): class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor): """Extractor for manga chapters from hiperdex.com""" - pattern = BASE_PATTERN + r"(/manga/([^/?&#]+)/([^/?&#]+))" + pattern = BASE_PATTERN + r"(/manga/([^/?#]+)/([^/?#]+))" test = ( ("https://hiperdex.com/manga/domestic-na-kanojo/154-5/", { "pattern": r"https://hiperdex.(com|net|info)/wp-content/uploads" @@ -105,7 +105,7 @@ class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor): class HiperdexMangaExtractor(HiperdexBase, MangaExtractor): """Extractor for manga from hiperdex.com""" chapterclass = HiperdexChapterExtractor - pattern = BASE_PATTERN + r"(/manga/([^/?&#]+))/?$" + pattern = BASE_PATTERN + r"(/manga/([^/?#]+))/?$" test = ( ("https://hiperdex.com/manga/youre-not-that-special/", { "count": 51, @@ -157,7 +157,7 @@ class HiperdexArtistExtractor(HiperdexBase, MangaExtractor): categorytransfer = False chapterclass = HiperdexMangaExtractor reverse = False - pattern = BASE_PATTERN + r"(/manga-a(?:rtist|uthor)/([^/?&#]+))" + pattern = BASE_PATTERN + r"(/manga-a(?:rtist|uthor)/([^/?#]+))" test = ( ("https://hiperdex.com/manga-artist/beck-ho-an/"), ("https://hiperdex.net/manga-artist/beck-ho-an/"), diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py index f341c47..994e1b7 100644 --- a/gallery_dl/extractor/hitomi.py +++ b/gallery_dl/extractor/hitomi.py @@ -21,10 +21,10 @@ class HitomiGalleryExtractor(GalleryExtractor): root = "https://hitomi.la" pattern = (r"(?:https?://)?hitomi\.la" r"/(?:manga|doujinshi|cg|gamecg|galleries|reader)" - r"/(?:[^/?&#]+-)?(\d+)") + r"/(?:[^/?#]+-)?(\d+)") test = ( ("https://hitomi.la/galleries/867789.html", { - "pattern": r"https://[a-c]a.hitomi.la/images/./../[0-9a-f]+.jpg", + "pattern": r"https://[a-c]b.hitomi.la/images/./../[0-9a-f]+.jpg", "keyword": "4873ef9a523621fc857b114e0b2820ba4066e9ae", "count": 16, }), @@ -35,12 +35,12 @@ class HitomiGalleryExtractor(GalleryExtractor): }), # Game CG with scenes (#321) ("https://hitomi.la/galleries/733697.html", { - "url": "b4cbc76032852db4a655bf6a2c4d58eae8153c8e", + "url": "ec3fe9b708ee376ec579b90d053ad485c0777552", "count": 210, }), # fallback for galleries only available through /reader/ URLs ("https://hitomi.la/galleries/1045954.html", { - "url": "f3aa914ad148437f72d307268fa0d250eabe8dab", + "url": "bf4ed4e726204da5bc37a236ca476a2a96081388", "count": 1413, }), # gallery with "broken" redirect @@ -143,7 +143,7 @@ class HitomiGalleryExtractor(GalleryExtractor): frontends = 2 if inum < 0x30 else 3 inum = 1 if inum < 0x09 else inum - url = "https://{}a.hitomi.la/images/{}/{}/{}.{}".format( + url = "https://{}b.hitomi.la/images/{}/{}/{}.{}".format( chr(97 + (inum % frontends)), ihash[-1], ihash[-3:-1], ihash, idata["extension"], @@ -158,7 +158,7 @@ class HitomiTagExtractor(Extractor): subcategory = "tag" pattern = (r"(?:https?://)?hitomi\.la/" r"(tag|artist|group|series|type|character)/" - r"([^/?&#]+)\.html") + r"([^/?#]+)\.html") test = ( ("https://hitomi.la/tag/screenshots-japanese.html", { "pattern": HitomiGalleryExtractor.pattern, diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py index bf0ac63..8785f65 100644 --- a/gallery_dl/extractor/imagefap.py +++ b/gallery_dl/extractor/imagefap.py @@ -151,7 +151,7 @@ class ImagefapUserExtractor(ImagefapExtractor): subcategory = "user" categorytransfer = True pattern = (BASE_PATTERN + - r"/(?:profile(?:\.php\?user=|/)([^/?&#]+)" + r"/(?:profile(?:\.php\?user=|/)([^/?#]+)" r"|usergallery\.php\?userid=(\d+))") test = ( ("https://www.imagefap.com/profile/LucyRae/galleries", { diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py index 4015bfd..ad5a508 100644 --- a/gallery_dl/extractor/imagehosts.py +++ b/gallery_dl/extractor/imagehosts.py @@ -176,7 +176,7 @@ class ImagetwistImageExtractor(ImagehostImageExtractor): class ImgspiceImageExtractor(ImagehostImageExtractor): """Extractor for single images from imgspice.com""" category = "imgspice" - pattern = r"(?:https?://)?((?:www\.)?imgspice\.com/([^/?&#]+))" + pattern = r"(?:https?://)?((?:www\.)?imgspice\.com/([^/?#]+))" test = ("https://imgspice.com/nwfwtpyog50y/test.png.html", { "url": "b8c30a8f51ee1012959a4cfd46197fabf14de984", "keyword": "100e310a19a2fa22d87e1bbc427ecb9f6501e0c0", @@ -198,7 +198,7 @@ class PixhostImageExtractor(ImagehostImageExtractor): """Extractor for single images from pixhost.to""" category = "pixhost" pattern = (r"(?:https?://)?((?:www\.)?pixhost\.(?:to|org)" - r"/show/\d+/(\d+)_[^/?&#]+)") + r"/show/\d+/(\d+)_[^/?#]+)") test = ("http://pixhost.to/show/190/130327671_test-.png", { "url": "4e5470dcf6513944773044d40d883221bbc46cff", "keyword": "3bad6d59db42a5ebbd7842c2307e1c3ebd35e6b0", @@ -218,7 +218,7 @@ class PostimgImageExtractor(ImagehostImageExtractor): """Extractor for single images from postimages.org""" category = "postimg" pattern = (r"(?:https?://)?((?:www\.)?(?:postimg|pixxxels)\.(?:cc|org)" - r"/(?:image/)?([^/?&#]+)/?)") + r"/(?:image/)?([^/?#]+)/?)") test = ("https://postimg.cc/Wtn2b3hC", { "url": "0794cfda9b8951a8ac3aa692472484200254ab86", "keyword": "2d05808d04e4e83e33200db83521af06e3147a84", @@ -237,7 +237,7 @@ class TurboimagehostImageExtractor(ImagehostImageExtractor): """Extractor for single images from www.turboimagehost.com""" category = "turboimagehost" pattern = (r"(?:https?://)?((?:www\.)?turboimagehost\.com" - r"/p/(\d+)/[^/?&#]+\.html)") + r"/p/(\d+)/[^/?#]+\.html)") test = ("https://www.turboimagehost.com/p/39078423/test--.png.html", { "url": "b94de43612318771ced924cb5085976f13b3b90e", "keyword": "704757ca8825f51cec516ec44c1e627c1f2058ca", diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py index 2a69fb1..5dcca62 100644 --- a/gallery_dl/extractor/imgbb.py +++ b/gallery_dl/extractor/imgbb.py @@ -115,7 +115,7 @@ class ImgbbAlbumExtractor(ImgbbExtractor): """Extractor for albums on imgbb.com""" subcategory = "album" directory_fmt = ("{category}", "{user}", "{album_name} {album_id}") - pattern = r"(?:https?://)?ibb\.co/album/([^/?&#]+)/?(?:\?([^#]+))?" + pattern = r"(?:https?://)?ibb\.co/album/([^/?#]+)/?(?:\?([^#]+))?" test = ( ("https://ibb.co/album/i5PggF", { "range": "1-80", @@ -173,7 +173,7 @@ class ImgbbUserExtractor(ImgbbExtractor): pattern = r"(?:https?://)?([^.]+)\.imgbb\.com/?(?:\?([^#]+))?$" test = ("https://folkie.imgbb.com", { "range": "1-80", - "pattern": r"https?://i\.ibb\.co/\w+/[^/?&#]+", + "pattern": r"https?://i\.ibb\.co/\w+/[^/?#]+", }) def __init__(self, match): @@ -197,7 +197,7 @@ class ImgbbUserExtractor(ImgbbExtractor): class ImgbbImageExtractor(ImgbbExtractor): subcategory = "image" - pattern = r"(?:https?://)?ibb\.co/(?!album/)([^/?&#]+)" + pattern = r"(?:https?://)?ibb\.co/(?!album/)([^/?#]+)" test = ("https://ibb.co/fUqh5b", { "pattern": r"https://i\.ibb\.co/g3kvx80/Arundel-Ireeman-5\.jpg", "content": "c5a0965178a8b357acd8aa39660092918c63795e", diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index 4391e64..ae4e606 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -259,7 +259,7 @@ class ImgurGalleryExtractor(ImgurExtractor): class ImgurUserExtractor(ImgurExtractor): """Extractor for all images posted by a user""" subcategory = "user" - pattern = BASE_PATTERN + r"/user/([^/?&#]+)(?:/posts|/submitted)?/?$" + pattern = BASE_PATTERN + r"/user/([^/?#]+)(?:/posts|/submitted)?/?$" test = ( ("https://imgur.com/user/Miguenzo", { "range": "1-100", @@ -277,7 +277,7 @@ class ImgurUserExtractor(ImgurExtractor): class ImgurFavoriteExtractor(ImgurExtractor): """Extractor for a user's favorites""" subcategory = "favorite" - pattern = BASE_PATTERN + r"/user/([^/?&#]+)/favorites" + pattern = BASE_PATTERN + r"/user/([^/?#]+)/favorites" test = ("https://imgur.com/user/Miguenzo/favorites", { "range": "1-100", "count": 100, @@ -291,7 +291,7 @@ class ImgurFavoriteExtractor(ImgurExtractor): class ImgurSubredditExtractor(ImgurExtractor): """Extractor for a subreddits's imgur links""" subcategory = "subreddit" - pattern = BASE_PATTERN + r"/r/([^/?&#]+)" + pattern = BASE_PATTERN + r"/r/([^/?#]+)" test = ("https://imgur.com/r/pics", { "range": "1-100", "count": 100, @@ -305,7 +305,7 @@ class ImgurSubredditExtractor(ImgurExtractor): class ImgurTagExtractor(ImgurExtractor): """Extractor for imgur tag searches""" subcategory = "tag" - pattern = BASE_PATTERN + r"/t/([^/?&#]+)$" + pattern = BASE_PATTERN + r"/t/([^/?#]+)$" test = ("https://imgur.com/t/animals", { "range": "1-100", "count": 100, @@ -319,7 +319,7 @@ class ImgurTagExtractor(ImgurExtractor): class ImgurSearchExtractor(ImgurExtractor): """Extractor for imgur search results""" subcategory = "search" - pattern = BASE_PATTERN + r"/search(?:/[^?&#]+)?/?\?q=([^&#]+)" + pattern = BASE_PATTERN + r"/search(?:/[^?#]+)?/?\?q=([^&#]+)" test = ("https://imgur.com/search?q=cute+cat", { "range": "1-100", "count": 100, diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py index ff8318c..6051db0 100644 --- a/gallery_dl/extractor/inkbunny.py +++ b/gallery_dl/extractor/inkbunny.py @@ -60,7 +60,7 @@ class InkbunnyExtractor(Extractor): class InkbunnyUserExtractor(InkbunnyExtractor): """Extractor for inkbunny user profiles""" subcategory = "user" - pattern = BASE_PATTERN + r"/(?!s/)(gallery/|scraps/)?([^/?&#]+)" + pattern = BASE_PATTERN + r"/(?!s/)(gallery/|scraps/)?([^/?#]+)" test = ( ("https://inkbunny.net/soina", { "pattern": r"https://[\w.]+\.metapix\.net/files/full" diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 639f272..1194626 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -343,7 +343,7 @@ class InstagramImageExtractor(InstagramExtractor): """Extractor for PostPage""" subcategory = "image" pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" - r"/(?:p|tv|reel)/([^/?&#]+)") + r"/(?:p|tv|reel)/([^/?#]+)") test = ( # GraphImage ("https://www.instagram.com/p/BqvsDleB3lV/", { @@ -458,7 +458,7 @@ class InstagramStoriesExtractor(InstagramExtractor): """Extractor for StoriesPage""" subcategory = "stories" pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" - r"/stories/([^/?&#]+)(?:/(\d+))?") + r"/stories/([^/?#]+)(?:/(\d+))?") test = ( ("https://www.instagram.com/stories/instagram/"), ("https://www.instagram.com/stories/highlights/18042509488170095/"), @@ -478,7 +478,7 @@ class InstagramSavedExtractor(InstagramExtractor): subcategory = "saved" pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" r"/(?!p/|explore/|directory/|accounts/|stories/|tv/)" - r"([^/?&#]+)/saved") + r"([^/?#]+)/saved") test = ("https://www.instagram.com/instagram/saved/",) def __init__(self, match): @@ -504,7 +504,7 @@ class InstagramUserExtractor(InstagramExtractor): subcategory = "user" pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" r"/(?!(?:p|explore|directory|accounts|stories|tv|reel)/)" - r"([^/?&#]+)/?(?:$|[?#])") + r"([^/?#]+)/?(?:$|[?#])") test = ( ("https://www.instagram.com/instagram/", { "range": "1-16", @@ -550,7 +550,7 @@ class InstagramChannelExtractor(InstagramExtractor): subcategory = "channel" pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" r"/(?!p/|explore/|directory/|accounts/|stories/|tv/)" - r"([^/?&#]+)/channel") + r"([^/?#]+)/channel") test = ("https://www.instagram.com/instagram/channel/", { "range": "1-16", "count": ">= 16", @@ -579,7 +579,7 @@ class InstagramTagExtractor(InstagramExtractor): subcategory = "tag" directory_fmt = ("{category}", "{subcategory}", "{tag}") pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" - r"/explore/tags/([^/?&#]+)") + r"/explore/tags/([^/?#]+)") test = ("https://www.instagram.com/explore/tags/instagram/", { "range": "1-16", "count": ">= 16", diff --git a/gallery_dl/extractor/issuu.py b/gallery_dl/extractor/issuu.py index b34b288..6266e5f 100644 --- a/gallery_dl/extractor/issuu.py +++ b/gallery_dl/extractor/issuu.py @@ -26,7 +26,7 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor): "{document[originalPublishDate]} {document[title]}") filename_fmt = "{num:>03}.{extension}" archive_fmt = "{document[id]}_{num}" - pattern = r"(?:https?://)?issuu\.com(/[^/?&#]+/docs/[^/?&#]+)" + pattern = r"(?:https?://)?issuu\.com(/[^/?#]+/docs/[^/?#]+)" test = ("https://issuu.com/issuu/docs/motions-1-2019/", { "pattern": r"https://image.isu.pub/190916155301-\w+/jpg/page_\d+.jpg", "count" : 36, @@ -80,7 +80,7 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor): class IssuuUserExtractor(IssuuBase, Extractor): """Extractor for all publications of a user/publisher""" subcategory = "user" - pattern = r"(?:https?://)?issuu\.com/([^/?&#]+)/?$" + pattern = r"(?:https?://)?issuu\.com/([^/?#]+)/?$" test = ("https://issuu.com/issuu", { "pattern": IssuuPublicationExtractor.pattern, "count" : "> 25", diff --git a/gallery_dl/extractor/khinsider.py b/gallery_dl/extractor/khinsider.py index 2550af2..6ddf0e8 100644 --- a/gallery_dl/extractor/khinsider.py +++ b/gallery_dl/extractor/khinsider.py @@ -19,7 +19,7 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor): directory_fmt = ("{category}", "{album[name]}") archive_fmt = "{filename}.{extension}" pattern = (r"(?:https?://)?downloads\.khinsider\.com" - r"/game-soundtracks/album/([^/?&#]+)") + r"/game-soundtracks/album/([^/?#]+)") root = "https://downloads.khinsider.com" test = (("https://downloads.khinsider.com" "/game-soundtracks/album/horizon-riders-wii"), { diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py index 6e7f139..b54afb7 100644 --- a/gallery_dl/extractor/komikcast.py +++ b/gallery_dl/extractor/komikcast.py @@ -45,7 +45,7 @@ class KomikcastBase(): class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor): """Extractor for manga-chapters from komikcast.com""" - pattern = r"(?:https?://)?(?:www\.)?komikcast\.com(/chapter/[^/?&#]+/)" + pattern = r"(?:https?://)?(?:www\.)?komikcast\.com(/chapter/[^/?#]+/)" test = ( (("https://komikcast.com/chapter/" "apotheosis-chapter-02-2-bahasa-indonesia/"), { @@ -81,7 +81,7 @@ class KomikcastMangaExtractor(KomikcastBase, MangaExtractor): """Extractor for manga from komikcast.com""" chapterclass = KomikcastChapterExtractor pattern = (r"(?:https?://)?(?:www\.)?komikcast\.com" - r"(/(?:komik/)?[^/?&#]+)/?$") + r"(/(?:komik/)?[^/?#]+)/?$") test = ( ("https://komikcast.com/komik/090-eko-to-issho/", { "url": "dc798d107697d1f2309b14ca24ca9dba30c6600f", diff --git a/gallery_dl/extractor/lineblog.py b/gallery_dl/extractor/lineblog.py index a1daa39..4071a26 100644 --- a/gallery_dl/extractor/lineblog.py +++ b/gallery_dl/extractor/lineblog.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019 Mike Fährmann +# Copyright 2019-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -42,7 +42,7 @@ class LineblogBase(): class LineblogBlogExtractor(LineblogBase, LivedoorBlogExtractor): """Extractor for a user's blog on lineblog.me""" - pattern = r"(?:https?://)?lineblog\.me/(\w+)/?(?:$|[?&#])" + pattern = r"(?:https?://)?lineblog\.me/(\w+)/?(?:$|[?#])" test = ("https://lineblog.me/mamoru_miyano/", { "range": "1-20", "count": 20, diff --git a/gallery_dl/extractor/livedoor.py b/gallery_dl/extractor/livedoor.py index 9d2383f..feffdfd 100644 --- a/gallery_dl/extractor/livedoor.py +++ b/gallery_dl/extractor/livedoor.py @@ -84,7 +84,7 @@ class LivedoorExtractor(Extractor): class LivedoorBlogExtractor(LivedoorExtractor): """Extractor for a user's blog on blog.livedoor.jp""" subcategory = "blog" - pattern = r"(?:https?://)?blog\.livedoor\.jp/(\w+)/?(?:$|[?&#])" + pattern = r"(?:https?://)?blog\.livedoor\.jp/(\w+)/?(?:$|[?#])" test = ( ("http://blog.livedoor.jp/zatsu_ke/", { "range": "1-50", diff --git a/gallery_dl/extractor/luscious.py b/gallery_dl/extractor/luscious.py index 7561c64..143d00d 100644 --- a/gallery_dl/extractor/luscious.py +++ b/gallery_dl/extractor/luscious.py @@ -46,7 +46,7 @@ class LusciousAlbumExtractor(LusciousExtractor): directory_fmt = ("{category}", "{album[id]} {album[title]}") archive_fmt = "{album[id]}_{id}" pattern = (r"(?:https?://)?(?:www\.|members\.)?luscious\.net" - r"/(?:albums|pictures/c/[^/?&#]+/album)/[^/?&#]+_(\d+)") + r"/(?:albums|pictures/c/[^/?#]+/album)/[^/?#]+_(\d+)") test = ( ("https://luscious.net/albums/okinami-no-koigokoro_277031/", { "url": "7e4984a271a1072ac6483e4228a045895aff86f3", diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py index 7e2d613..d50e0f2 100644 --- a/gallery_dl/extractor/mangadex.py +++ b/gallery_dl/extractor/mangadex.py @@ -83,18 +83,19 @@ class MangadexChapterExtractor(MangadexExtractor): chapter, sep, minor = cdata["chapter"].partition(".") return { - "manga": mdata["manga"]["title"], + "manga" : text.unescape(mdata["manga"]["title"]), "manga_id": cdata["manga_id"], - "artist": mdata["manga"]["artist"], - "author": mdata["manga"]["author"], - "title": text.unescape(cdata["title"]), - "volume": text.parse_int(cdata["volume"]), - "chapter": text.parse_int(chapter), + "artist" : text.unescape(mdata["manga"]["artist"]), + "author" : text.unescape(mdata["manga"]["author"]), + "title" : text.unescape(cdata["title"]), + "volume" : text.parse_int(cdata["volume"]), + "chapter" : text.parse_int(chapter), "chapter_minor": sep + minor, "chapter_id": cdata["id"], - "group": mdata["chapter"][self.chapter_id]["group_name"], - "date": text.parse_timestamp(cdata["timestamp"]), - "lang": util.language_to_code(cdata["lang_name"]), + "group" : text.unescape( + mdata["chapter"][self.chapter_id]["group_name"]), + "date" : text.parse_timestamp(cdata["timestamp"]), + "lang" : util.language_to_code(cdata["lang_name"]), "language": cdata["lang_name"], } @@ -159,18 +160,18 @@ class MangadexMangaExtractor(MangadexExtractor): chapter, sep, minor = info["chapter"].partition(".") lang = self.iso639_map.get(info["lang_code"], info["lang_code"]) results.append({ - "manga": manga["title"], + "manga" : text.unescape(manga["title"]), "manga_id": self.manga_id, - "artist": manga["artist"], - "author": manga["author"], - "title": text.unescape(info["title"]), - "volume": text.parse_int(info["volume"]), - "chapter": text.parse_int(chapter), + "artist" : text.unescape(manga["artist"]), + "author" : text.unescape(manga["author"]), + "title" : text.unescape(info["title"]), + "volume" : text.parse_int(info["volume"]), + "chapter" : text.parse_int(chapter), "chapter_minor": sep + minor, "chapter_id": text.parse_int(chid), - "group": text.unescape(info["group_name"]), - "date": text.parse_timestamp(info["timestamp"]), - "lang": lang, + "group" : text.unescape(info["group_name"]), + "date" : text.parse_timestamp(info["timestamp"]), + "lang" : lang, "language": util.code_to_language(lang), "_extractor": MangadexChapterExtractor, }) diff --git a/gallery_dl/extractor/mangafox.py b/gallery_dl/extractor/mangafox.py index 1b8a4a6..a123783 100644 --- a/gallery_dl/extractor/mangafox.py +++ b/gallery_dl/extractor/mangafox.py @@ -16,7 +16,7 @@ class MangafoxChapterExtractor(ChapterExtractor): """Extractor for manga-chapters from fanfox.net""" category = "mangafox" pattern = (r"(?:https?://)?(?:www\.|m\.)?(?:mangafox\.me|fanfox\.net)" - r"(/manga/[^/]+/((?:v(\d+)/)?c(\d+)([^/?&#]*)))") + r"(/manga/[^/]+/((?:v(\d+)/)?c(\d+)([^/?#]*)))") test = ( ("http://fanfox.net/manga/kidou_keisatsu_patlabor/v05/c006.2/1.html", { "keyword": "5661dab258d42d09d98f194f7172fb9851a49766", @@ -53,9 +53,9 @@ class MangafoxChapterExtractor(ChapterExtractor): pnum = 1 while True: url, pos = text.extract(page, '<img src="', '"') - yield url, None + yield text.ensure_http_scheme(url), None url, pos = text.extract(page, ' src="', '"', pos) - yield url, None + yield text.ensure_http_scheme(url), None pnum += 2 page = self.request("{}/{}.html".format(self.urlbase, pnum)).text diff --git a/gallery_dl/extractor/mangahere.py b/gallery_dl/extractor/mangahere.py index 52cc672..653c61a 100644 --- a/gallery_dl/extractor/mangahere.py +++ b/gallery_dl/extractor/mangahere.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2019 Mike Fährmann +# Copyright 2015-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -24,12 +24,16 @@ class MangahereBase(): class MangahereChapterExtractor(MangahereBase, ChapterExtractor): """Extractor for manga-chapters from mangahere.cc""" pattern = (r"(?:https?://)?(?:www\.|m\.)?mangahere\.c[co]/manga/" - r"([^/]+(?:/v0*(\d+))?/c([^/?&#]+))") + r"([^/]+(?:/v0*(\d+))?/c([^/?#]+))") test = ( ("https://www.mangahere.cc/manga/dongguo_xiaojie/c004.2/", { "keyword": "7c98d7b50a47e6757b089aa875a53aa970cac66f", "content": "708d475f06893b88549cbd30df1e3f9428f2c884", }), + # URLs without HTTP scheme (#1070) + ("https://www.mangahere.cc/manga/beastars/c196/1.html", { + "pattern": "https://zjcdn.mangahere.org/.*", + }), ("http://www.mangahere.co/manga/dongguo_xiaojie/c003.2/"), ("http://m.mangahere.co/manga/dongguo_xiaojie/c003.2/"), ) @@ -65,9 +69,9 @@ class MangahereChapterExtractor(MangahereBase, ChapterExtractor): while True: url, pos = text.extract(page, '<img src="', '"') - yield url, None + yield text.ensure_http_scheme(url), None url, pos = text.extract(page, ' src="', '"', pos) - yield url, None + yield text.ensure_http_scheme(url), None pnum += 2 page = self.request(self.url_fmt.format(self.part, pnum)).text diff --git a/gallery_dl/extractor/mangakakalot.py b/gallery_dl/extractor/mangakakalot.py index 8686b2d..951a257 100644 --- a/gallery_dl/extractor/mangakakalot.py +++ b/gallery_dl/extractor/mangakakalot.py @@ -32,7 +32,7 @@ class MangakakalotBase(): class MangakakalotChapterExtractor(MangakakalotBase, ChapterExtractor): """Extractor for manga-chapters from mangakakalot.com""" pattern = (r"(?:https?://)?(?:www\.)?mangakakalot\.com" - r"(/chapter/\w+/chapter_[^/?&#]+)") + r"(/chapter/\w+/chapter_[^/?#]+)") test = ( ("https://mangakakalot.com/chapter/rx922077/chapter_6", { "pattern": r"https://s\d+\.\w+\.com/mangakakalot/r\d+/rx922077/" @@ -78,7 +78,7 @@ class MangakakalotChapterExtractor(MangakakalotBase, ChapterExtractor): } def images(self, page): - page = text.extract(page, 'id="vungdoc"', '\n</div>')[0] + page = text.extract(page, 'id="vungdoc"', '\n<div')[0] return [ (url, None) for url in text.extract_iter(page, '<img src="', '"') diff --git a/gallery_dl/extractor/mangapanda.py b/gallery_dl/extractor/mangapanda.py index a4b8340..6067bd0 100644 --- a/gallery_dl/extractor/mangapanda.py +++ b/gallery_dl/extractor/mangapanda.py @@ -35,7 +35,7 @@ class MangapandaBase(): class MangapandaChapterExtractor(MangapandaBase, ChapterExtractor): """Extractor for manga-chapters from mangapanda.com""" archive_fmt = "{manga}_{chapter}_{page}" - pattern = r"(?:https?://)?(?:www\.)?mangapanda\.com((/[^/?&#]+)/(\d+))" + pattern = r"(?:https?://)?(?:www\.)?mangapanda\.com((/[^/?#]+)/(\d+))" test = ("https://www.mangapanda.com/red-storm/2", { "url": "1f633f776e950531ba9b1e81965316458e785261", "keyword": "b24df4b9cc36383fb6a44e06d32a3884a4dcb5fb", @@ -96,7 +96,7 @@ class MangapandaMangaExtractor(MangapandaBase, MangaExtractor): """Extractor for manga from mangapanda.com""" chapterclass = MangapandaChapterExtractor reverse = False - pattern = r"(?:https?://)?(?:www\.)?mangapanda\.com(/[^/?&#]+)/?$" + pattern = r"(?:https?://)?(?:www\.)?mangapanda\.com(/[^/?#]+)/?$" test = ("https://www.mangapanda.com/mushishi", { "url": "357f965732371cac1990fee8b480f62e29141a42", "keyword": "031b3ea085921c552de017ecbb9b906e462229c9", diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py index 59a046c..0a6fba4 100644 --- a/gallery_dl/extractor/mangapark.py +++ b/gallery_dl/extractor/mangapark.py @@ -51,7 +51,7 @@ class MangaparkBase(): class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor): """Extractor for manga-chapters from mangapark.net""" pattern = (r"(?:https?://)?(?:www\.)?mangapark\.(me|net|com)" - r"/manga/([^?&#]+/i\d+)") + r"/manga/([^?#]+/i\d+)") test = ( ("https://mangapark.net/manga/gosu/i811653/c055/1", { "count": 50, @@ -117,7 +117,7 @@ class MangaparkMangaExtractor(MangaparkBase, MangaExtractor): """Extractor for manga from mangapark.net""" chapterclass = MangaparkChapterExtractor pattern = (r"(?:https?://)?(?:www\.)?mangapark\.(me|net|com)" - r"(/manga/[^/?&#]+)/?$") + r"(/manga/[^/?#]+)/?$") test = ( ("https://mangapark.net/manga/aria", { "url": "9b62883c25c8de471f8ab43651e1448536c4ce3f", diff --git a/gallery_dl/extractor/mangareader.py b/gallery_dl/extractor/mangareader.py index fd9c7ac..30b8ce3 100644 --- a/gallery_dl/extractor/mangareader.py +++ b/gallery_dl/extractor/mangareader.py @@ -53,7 +53,7 @@ class MangareaderBase(): class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor): """Extractor for manga-chapters from mangareader.net""" archive_fmt = "{manga}_{chapter}_{page}" - pattern = r"(?:https?://)?(?:www\.)?mangareader\.net((/[^/?&#]+)/(\d+))" + pattern = r"(?:https?://)?(?:www\.)?mangareader\.net((/[^/?#]+)/(\d+))" test = (("https://www.mangareader.net" "/karate-shoukoushi-kohinata-minoru/11"), { "url": "45ece5668d1e9f65cf2225237d78de58660b54e4", @@ -84,7 +84,7 @@ class MangareaderMangaExtractor(MangareaderBase, MangaExtractor): """Extractor for manga from mangareader.net""" chapterclass = MangareaderChapterExtractor reverse = False - pattern = r"(?:https?://)?(?:www\.)?mangareader\.net(/[^/?&#]+)/?$" + pattern = r"(?:https?://)?(?:www\.)?mangareader\.net(/[^/?#]+)/?$" test = ("https://www.mangareader.net/mushishi", { "url": "bc203b858b4ad76e5d77e39118a7be0350e357da", "keyword": "031b3ea085921c552de017ecbb9b906e462229c9", diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index ac17cb0..0e063d5 100644 --- a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -184,7 +184,7 @@ def generate_extractors(): Extr.category = category Extr.instance = instance Extr.pattern = (r"(?:https?://)?" + pattern + - r"/@([^/?&#]+)(?:/media)?/?$") + r"/@([^/?#]+)(?:/media)?/?$") Extr.test = info.get("test-user") Extr.root = root Extr.access_token = token @@ -197,7 +197,7 @@ def generate_extractors(): Extr.__doc__ = "Extractor for images from a status on " + instance Extr.category = category Extr.instance = instance - Extr.pattern = r"(?:https?://)?" + pattern + r"/@[^/?&#]+/(\d+)" + Extr.pattern = r"(?:https?://)?" + pattern + r"/@[^/?#]+/(\d+)" Extr.test = info.get("test-status") Extr.root = root Extr.access_token = token diff --git a/gallery_dl/extractor/message.py b/gallery_dl/extractor/message.py index 088fdd6..7bf0084 100644 --- a/gallery_dl/extractor/message.py +++ b/gallery_dl/extractor/message.py @@ -40,7 +40,7 @@ class Message(): - 2nd element is the (external) URL as a string - 3rd element is a dictionary containing URL-specific metadata - - Message.Urllist: + - Message.Urllist: # obsolete - Same as Message.Url, but its 2nd element is a list of multiple URLs - The additional URLs serve as a fallback if the primary one fails """ @@ -51,5 +51,5 @@ class Message(): # Headers = 4 # Cookies = 5 Queue = 6 - Urllist = 7 + # Urllist = 7 Metadata = 8 diff --git a/gallery_dl/extractor/myportfolio.py b/gallery_dl/extractor/myportfolio.py index e2e163a..abb937f 100644 --- a/gallery_dl/extractor/myportfolio.py +++ b/gallery_dl/extractor/myportfolio.py @@ -21,14 +21,14 @@ class MyportfolioGalleryExtractor(Extractor): archive_fmt = "{user}_{filename}" pattern = (r"(?:myportfolio:(?:https?://)?([^/]+)|" r"(?:https?://)?([^.]+\.myportfolio\.com))" - r"(/[^/?&#]+)?") + r"(/[^/?#]+)?") test = ( ("https://andrewling.myportfolio.com/volvo-xc-90-hybrid", { "url": "acea0690c76db0e5cf267648cefd86e921bc3499", "keyword": "6ac6befe2ee0af921d24cf1dd4a4ed71be06db6d", }), ("https://andrewling.myportfolio.com/", { - "pattern": r"https://andrewling\.myportfolio\.com/[^/?&#+]+$", + "pattern": r"https://andrewling\.myportfolio\.com/[^/?#+]+$", "count": ">= 6", }), ("https://stevenilousphotography.myportfolio.com/society", { diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index f9dc886..a6cc5fa 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -39,6 +39,7 @@ class NewgroundsExtractor(Extractor): post = self.extract_post(post_url) url = post.get("url") except Exception: + self.log.debug("", exc_info=True) url = None if url: @@ -49,8 +50,8 @@ class NewgroundsExtractor(Extractor): post["_comment"], 'data-smartload-src="', '"'), 1): post["num"] = num post["_index"] = "{}_{:>02}".format(post["index"], num) - text.nameext_from_url(url, post) - yield Message.Url, url, post + url = text.ensure_http_scheme(url) + yield Message.Url, url, text.nameext_from_url(url, post) else: self.log.warning( "Unable to get download URL for '%s'", post_url) @@ -159,6 +160,7 @@ class NewgroundsExtractor(Extractor): if src: src = src.replace("\\/", "/") + fallback = () date = text.parse_datetime(extr( 'itemprop="datePublished" content="', '"')) else: @@ -168,8 +170,9 @@ class NewgroundsExtractor(Extractor): "X-Requested-With": "XMLHttpRequest", "Referer": self.root, } - data = self.request(url, headers=headers).json() - src = data["sources"]["360p"][0]["src"].replace(".360p.", ".") + sources = self.request(url, headers=headers).json()["sources"] + src = sources["360p"][0]["src"].replace(".360p.", ".") + fallback = self._video_fallback(sources) date = text.parse_timestamp(src.rpartition("?")[2]) return { @@ -181,8 +184,16 @@ class NewgroundsExtractor(Extractor): "rating" : extr('class="rated-', '"'), "index" : text.parse_int(index), "_index" : index, + "_fallback" : fallback, } + @staticmethod + def _video_fallback(sources): + sources = list(sources.items()) + sources.sort(key=lambda src: text.parse_int(src[0][:-1]), reverse=True) + for src in sources: + yield src[1][0]["src"] + def _pagination(self, kind): root = self.user_root headers = { @@ -218,7 +229,7 @@ class NewgroundsImageExtractor(NewgroundsExtractor): """Extractor for a single image from newgrounds.com""" subcategory = "image" pattern = (r"(?:https?://)?(?:" - r"(?:www\.)?newgrounds\.com/art/view/([^/?&#]+)/[^/?&#]+" + r"(?:www\.)?newgrounds\.com/art/view/([^/?#]+)/[^/?#]+" r"|art\.ngfiles\.com/images/\d+/\d+_([^_]+)_([^.]+))") test = ( ("https://www.newgrounds.com/art/view/tomfulp/ryu-is-hawt", { diff --git a/gallery_dl/extractor/ngomik.py b/gallery_dl/extractor/ngomik.py index f3608b2..8e29d97 100644 --- a/gallery_dl/extractor/ngomik.py +++ b/gallery_dl/extractor/ngomik.py @@ -18,7 +18,7 @@ class NgomikChapterExtractor(ChapterExtractor): category = "ngomik" root = "http://ngomik.in" pattern = (r"(?:https?://)?(?:www\.)?ngomik\.in" - r"(/[^/?&#]+-chapter-[^/?&#]+)") + r"(/[^/?#]+-chapter-[^/?#]+)") test = ( ("https://www.ngomik.in/14-sai-no-koi-chapter-1-6/", { "url": "8e67fdf751bbc79bc6f4dead7675008ddb8e32a4", diff --git a/gallery_dl/extractor/nozomi.py b/gallery_dl/extractor/nozomi.py index 5e7e387..15bb576 100644 --- a/gallery_dl/extractor/nozomi.py +++ b/gallery_dl/extractor/nozomi.py @@ -124,7 +124,7 @@ class NozomiTagExtractor(NozomiExtractor): subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") archive_fmt = "t_{search_tags}_{postid}" - pattern = r"(?:https?://)?nozomi\.la/tag/([^/?&#]+)-\d+\." + pattern = r"(?:https?://)?nozomi\.la/tag/([^/?#]+)-\d+\." test = ("https://nozomi.la/tag/3:1_aspect_ratio-1.html", { "pattern": r"^https://i.nozomi.la/\w/\w\w/\w+\.\w+$", "count": ">= 25", diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index 6d7b27a..4bb2c48 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py @@ -315,7 +315,7 @@ class OAuthTumblr(OAuthBase): class OAuthMastodon(OAuthBase): subcategory = "mastodon" - pattern = "oauth:mastodon:(?:https?://)?([^/?&#]+)" + pattern = "oauth:mastodon:(?:https?://)?([^/?#]+)" def __init__(self, match): OAuthBase.__init__(self, match) diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py index f08055c..57521d6 100644 --- a/gallery_dl/extractor/paheal.py +++ b/gallery_dl/extractor/paheal.py @@ -44,7 +44,7 @@ class PahealTagExtractor(PahealExtractor): subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") pattern = (r"(?:https?://)?(?:rule34|rule63|cosplay)\.paheal\.net" - r"/post/list/([^/?&#]+)") + r"/post/list/([^/?#]+)") test = ("https://rule34.paheal.net/post/list/Ayane_Suzuki/1", { "pattern": r"https://[^.]+\.paheal\.net/_images/\w+/\d+%20-%20", "count": ">= 15" diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index f1e98d9..ad259f4 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -202,8 +202,8 @@ class PatreonCreatorExtractor(PatreonExtractor): """Extractor for a creator's works""" subcategory = "creator" pattern = (r"(?:https?://)?(?:www\.)?patreon\.com" - r"/(?!(?:home|join|posts|login|signup)(?:$|[/?&#]))" - r"([^/?&#]+)(?:/posts)?/?(?:\?([^#]+))?") + r"/(?!(?:home|join|posts|login|signup)(?:$|[/?#]))" + r"([^/?#]+)(?:/posts)?/?(?:\?([^#]+))?") test = ( ("https://www.patreon.com/koveliana", { "range": "1-25", @@ -283,7 +283,7 @@ class PatreonUserExtractor(PatreonExtractor): class PatreonPostExtractor(PatreonExtractor): """Extractor for media from a single post""" subcategory = "post" - pattern = r"(?:https?://)?(?:www\.)?patreon\.com/posts/([^/?&#]+)" + pattern = r"(?:https?://)?(?:www\.)?patreon\.com/posts/([^/?#]+)" test = ( # postfile + attachments ("https://www.patreon.com/posts/precious-metal-23563293", { diff --git a/gallery_dl/extractor/photobucket.py b/gallery_dl/extractor/photobucket.py index a6456da..5e2120a 100644 --- a/gallery_dl/extractor/photobucket.py +++ b/gallery_dl/extractor/photobucket.py @@ -22,7 +22,7 @@ class PhotobucketAlbumExtractor(Extractor): filename_fmt = "{offset:>03}{pictureId:?_//}_{titleOrFilename}.{extension}" archive_fmt = "{id}" pattern = (r"(?:https?://)?((?:[^.]+\.)?photobucket\.com)" - r"/user/[^/?&#]+/library(?:/[^?&#]*)?") + r"/user/[^/?#]+/library(?:/[^?#]*)?") test = ( ("https://s369.photobucket.com/user/CrpyLrkr/library", { "pattern": r"https?://[oi]+\d+.photobucket.com/albums/oo139/", @@ -111,8 +111,8 @@ class PhotobucketImageExtractor(Extractor): filename_fmt = "{pictureId:?/_/}{titleOrFilename}.{extension}" archive_fmt = "{username}_{id}" pattern = (r"(?:https?://)?(?:[^.]+\.)?photobucket\.com" - r"(?:/gallery/user/([^/?&#]+)/media/([^/?&#]+)" - r"|/user/([^/?&#]+)/media/[^?&#]+\.html)") + r"(?:/gallery/user/([^/?#]+)/media/([^/?#]+)" + r"|/user/([^/?#]+)/media/[^?#]+\.html)") test = ( (("https://s271.photobucket.com/user/lakerfanryan" "/media/Untitled-3-1.jpg.html"), { diff --git a/gallery_dl/extractor/piczel.py b/gallery_dl/extractor/piczel.py index 064967d..45bd8b5 100644 --- a/gallery_dl/extractor/piczel.py +++ b/gallery_dl/extractor/piczel.py @@ -67,7 +67,7 @@ class PiczelExtractor(Extractor): class PiczelUserExtractor(PiczelExtractor): """Extractor for all images from a user's gallery""" subcategory = "user" - pattern = r"(?:https?://)?(?:www\.)?piczel\.tv/gallery/([^/?&#]+)/?$" + pattern = r"(?:https?://)?(?:www\.)?piczel\.tv/gallery/([^/?#]+)/?$" test = ("https://piczel.tv/gallery/Bikupan", { "range": "1-100", "count": ">= 100", @@ -88,7 +88,7 @@ class PiczelFolderExtractor(PiczelExtractor): directory_fmt = ("{category}", "{user[username]}", "{folder[name]}") archive_fmt = "f{folder[id]}_{id}_{num}" pattern = (r"(?:https?://)?(?:www\.)?piczel\.tv" - r"/gallery/(?!image)([^/?&#]+)/(\d+)") + r"/gallery/(?!image)([^/?#]+)/(\d+)") test = ("https://piczel.tv/gallery/Lulena/1114", { "count": ">= 4", }) diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index cc89ac5..aa11289 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -9,7 +9,8 @@ """Extractors for https://www.pinterest.com/""" from .common import Extractor, Message -from .. import text, exception +from .. import text, util, exception +from ..cache import cache import itertools import json @@ -28,6 +29,7 @@ class PinterestExtractor(Extractor): self.api = PinterestAPI(self) def items(self): + self.api.login() data = self.metadata() yield Message.Version, 1 yield Message.Directory, data @@ -98,6 +100,10 @@ class PinterestBoardExtractor(PinterestExtractor): "options": (("sections", True),), "count": 5, }), + # secret board (#1055) + ("https://www.pinterest.de/g1952849/secret/", { + "count": 2, + }), ("https://www.pinterest.com/g1952848/test/", { "exception": exception.GalleryDLException, }), @@ -230,16 +236,22 @@ class PinterestAPI(): "Accept" : "application/json, text/javascript, " "*/*, q=0.01", "Accept-Language" : "en-US,en;q=0.5", - "X-Pinterest-AppState": "active", - "X-APP-VERSION" : "b00dd49", + "Referer" : BASE_URL + "/", "X-Requested-With" : "XMLHttpRequest", + "X-APP-VERSION" : "7a20185", + "X-CSRFToken" : None, + "X-Pinterest-AppState": "active", "Origin" : BASE_URL, - "Referer" : BASE_URL + "/", } def __init__(self, extractor): self.extractor = extractor + csrf_token = util.generate_csrf_token() + self.headers = self.HEADERS.copy() + self.headers["X-CSRFToken"] = csrf_token + self.cookies = {"csrftoken": csrf_token} + def pin(self, pin_id): """Query information about a pin""" options = {"id": pin_id, "field_set_key": "detailed"} @@ -282,12 +294,45 @@ class PinterestAPI(): options = {"board_id": board_id, "add_vase": True} return self._pagination("BoardRelatedPixieFeed", options) + def login(self): + """Login and obtain session cookies""" + username, password = self.extractor._get_auth_info() + if username: + self.cookies.update(self._login_impl(username, password)) + + @cache(maxage=180*24*3600, keyarg=1) + def _login_impl(self, username, password): + self.extractor.log.info("Logging in as %s", username) + + url = self.BASE_URL + "/resource/UserSessionResource/create/" + options = { + "username_or_email": username, + "password" : password, + } + data = {"data": json.dumps({"options": options}), "source_url": ""} + + try: + response = self.extractor.request( + url, method="POST", headers=self.headers, + cookies=self.cookies, data=data) + resource = response.json()["resource_response"] + except (exception.HttpError, ValueError, KeyError): + raise exception.AuthenticationError() + + if resource["status"] != "success": + raise exception.AuthenticationError() + return { + cookie.name: cookie.value + for cookie in response.cookies + } + def _call(self, resource, options): url = "{}/resource/{}Resource/get/".format(self.BASE_URL, resource) params = {"data": json.dumps({"options": options}), "source_url": ""} response = self.extractor.request( - url, params=params, headers=self.HEADERS, fatal=False) + url, params=params, headers=self.headers, + cookies=self.cookies, fatal=False) try: data = response.json() diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index ee8f9bb..a813d0e 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -86,7 +86,7 @@ class PixivUserExtractor(PixivExtractor): subcategory = "user" pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/(?:" r"(?:en/)?users/(\d+)(?:/(?:artworks|illustrations|manga)" - r"(?:/([^/?&#]+))?)?/?(?:$|[?#])" + r"(?:/([^/?#]+))?)?/?(?:$|[?#])" r"|member(?:_illust)?\.php\?id=(\d+)(?:&([^#]+))?" r"|(?:u(?:ser)?/|(?:mypage\.php)?#id=)(\d+))") test = ( @@ -170,7 +170,7 @@ class PixivUserExtractor(PixivExtractor): class PixivMeExtractor(PixivExtractor): """Extractor for pixiv.me URLs""" subcategory = "me" - pattern = r"(?:https?://)?pixiv\.me/([^/?&#]+)" + pattern = r"(?:https?://)?pixiv\.me/([^/?#]+)" test = ( ("https://pixiv.me/del_shannon", { "url": "29c295ce75150177e6b0a09089a949804c708fbf", @@ -243,7 +243,7 @@ class PixivFavoriteExtractor(PixivExtractor): "{user_bookmark[id]} {user_bookmark[account]}") archive_fmt = "f_{user_bookmark[id]}_{id}{num}.{extension}" pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/(?:(?:en/)?" - r"users/(\d+)/(bookmarks/artworks(?:/([^/?&#]+))?|following)" + r"users/(\d+)/(bookmarks/artworks(?:/([^/?#]+))?|following)" r"|bookmark\.php(?:\?([^#]*))?)") test = ( ("https://www.pixiv.net/en/users/173530/bookmarks/artworks", { @@ -407,7 +407,7 @@ class PixivSearchExtractor(PixivExtractor): archive_fmt = "s_{search[word]}_{id}{num}.{extension}" directory_fmt = ("{category}", "search", "{search[word]}") pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net" - r"/(?:(?:en/)?tags/([^/?&#]+)(?:/[^/?&#]+)?/?" + r"/(?:(?:en/)?tags/([^/?#]+)(?:/[^/?#]+)?/?" r"|search\.php)(?:\?([^#]+))?") test = ( ("https://www.pixiv.net/en/tags/Original", { diff --git a/gallery_dl/extractor/pixnet.py b/gallery_dl/extractor/pixnet.py index d8ac9f6..342f4fa 100644 --- a/gallery_dl/extractor/pixnet.py +++ b/gallery_dl/extractor/pixnet.py @@ -169,7 +169,7 @@ class PixnetUserExtractor(PixnetExtractor): """Extractor for all sets and folders of a pixnet user""" subcategory = "user" url_fmt = "{}{}/album/list" - pattern = BASE_PATTERN + r"()(?:/blog|/album(?:/list)?)?/?(?:$|[?&#])" + pattern = BASE_PATTERN + r"()(?:/blog|/album(?:/list)?)?/?(?:$|[?#])" test = ( ("https://albertayu773.pixnet.net/"), ("https://albertayu773.pixnet.net/blog"), diff --git a/gallery_dl/extractor/plurk.py b/gallery_dl/extractor/plurk.py index 60ca1fb..f2e964d 100644 --- a/gallery_dl/extractor/plurk.py +++ b/gallery_dl/extractor/plurk.py @@ -72,7 +72,7 @@ class PlurkExtractor(Extractor): class PlurkTimelineExtractor(PlurkExtractor): """Extractor for URLs from all posts in a Plurk timeline""" subcategory = "timeline" - pattern = r"(?:https?://)?(?:www\.)?plurk\.com/(?!p/)(\w+)/?(?:$|[?&#])" + pattern = r"(?:https?://)?(?:www\.)?plurk\.com/(?!p/)(\w+)/?(?:$|[?#])" test = ("https://www.plurk.com/plurkapi", { "pattern": r"https?://.+", "count": ">= 23" diff --git a/gallery_dl/extractor/pornhub.py b/gallery_dl/extractor/pornhub.py index 6b36cdd..1856c82 100644 --- a/gallery_dl/extractor/pornhub.py +++ b/gallery_dl/extractor/pornhub.py @@ -118,7 +118,7 @@ class PornhubGalleryExtractor(PornhubExtractor): class PornhubUserExtractor(PornhubExtractor): """Extractor for all galleries of a pornhub user""" subcategory = "user" - pattern = (BASE_PATTERN + r"/(users|model)/([^/?&#]+)" + pattern = (BASE_PATTERN + r"/(users|model)/([^/?#]+)" "(?:/photos(?:/(public|private|favorites))?)?/?$") test = ( ("https://www.pornhub.com/users/flyings0l0/photos/public", { diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py index e5b4b44..a20312f 100644 --- a/gallery_dl/extractor/reactor.py +++ b/gallery_dl/extractor/reactor.py @@ -150,7 +150,7 @@ class ReactorTagExtractor(ReactorExtractor): subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") archive_fmt = "{search_tags}_{post_id}_{num}" - pattern = BASE_PATTERN + r"/tag/([^/?&#]+)" + pattern = BASE_PATTERN + r"/tag/([^/?#]+)" test = ("http://anime.reactor.cc/tag/Anime+Art",) def __init__(self, match): @@ -166,7 +166,7 @@ class ReactorSearchExtractor(ReactorTagExtractor): subcategory = "search" directory_fmt = ("{category}", "search", "{search_tags}") archive_fmt = "s_{search_tags}_{post_id}_{num}" - pattern = BASE_PATTERN + r"/search(?:/|\?q=)([^/?&#]+)" + pattern = BASE_PATTERN + r"/search(?:/|\?q=)([^/?#]+)" test = ("http://anime.reactor.cc/search?q=Art",) @@ -174,7 +174,7 @@ class ReactorUserExtractor(ReactorExtractor): """Extractor for all posts of a user on *reactor.cc sites""" subcategory = "user" directory_fmt = ("{category}", "user", "{user}") - pattern = BASE_PATTERN + r"/user/([^/?&#]+)" + pattern = BASE_PATTERN + r"/user/([^/?#]+)" test = ("http://anime.reactor.cc/user/Shuster",) def __init__(self, match): @@ -215,7 +215,7 @@ JR_BASE_PATTERN = r"(?:https?://)?(?:www\.)?(joyreactor\.c(?:c|om))" class JoyreactorTagExtractor(ReactorTagExtractor): """Extractor for tag searches on joyreactor.cc""" category = "joyreactor" - pattern = JR_BASE_PATTERN + r"/tag/([^/?&#]+)" + pattern = JR_BASE_PATTERN + r"/tag/([^/?#]+)" test = ( ("http://joyreactor.cc/tag/Advent+Cirno", { "count": ">= 17", @@ -229,7 +229,7 @@ class JoyreactorTagExtractor(ReactorTagExtractor): class JoyreactorSearchExtractor(ReactorSearchExtractor): """Extractor for search results on joyreactor.cc""" category = "joyreactor" - pattern = JR_BASE_PATTERN + r"/search(?:/|\?q=)([^/?&#]+)" + pattern = JR_BASE_PATTERN + r"/search(?:/|\?q=)([^/?#]+)" test = ( ("http://joyreactor.cc/search/Cirno+Gifs", { "range": "1-25", @@ -244,7 +244,7 @@ class JoyreactorSearchExtractor(ReactorSearchExtractor): class JoyreactorUserExtractor(ReactorUserExtractor): """Extractor for all posts of a user on joyreactor.cc""" category = "joyreactor" - pattern = JR_BASE_PATTERN + r"/user/([^/?&#]+)" + pattern = JR_BASE_PATTERN + r"/user/([^/?#]+)" test = ( ("http://joyreactor.cc/user/hemantic"), ("http://joyreactor.com/user/Tacoman123", { @@ -289,7 +289,7 @@ PR_BASE_PATTERN = r"(?:https?://)?(?:www\.)?(pornreactor\.cc|fapreactor.com)" class PornreactorTagExtractor(ReactorTagExtractor): """Extractor for tag searches on pornreactor.cc""" category = "pornreactor" - pattern = PR_BASE_PATTERN + r"/tag/([^/?&#]+)" + pattern = PR_BASE_PATTERN + r"/tag/([^/?#]+)" test = ( ("http://pornreactor.cc/tag/RiceGnat", { "range": "1-25", @@ -302,7 +302,7 @@ class PornreactorTagExtractor(ReactorTagExtractor): class PornreactorSearchExtractor(ReactorSearchExtractor): """Extractor for search results on pornreactor.cc""" category = "pornreactor" - pattern = PR_BASE_PATTERN + r"/search(?:/|\?q=)([^/?&#]+)" + pattern = PR_BASE_PATTERN + r"/search(?:/|\?q=)([^/?#]+)" test = ( ("http://pornreactor.cc/search?q=ecchi+hentai", { "range": "1-25", @@ -315,7 +315,7 @@ class PornreactorSearchExtractor(ReactorSearchExtractor): class PornreactorUserExtractor(ReactorUserExtractor): """Extractor for all posts of a user on pornreactor.cc""" category = "pornreactor" - pattern = PR_BASE_PATTERN + r"/user/([^/?&#]+)" + pattern = PR_BASE_PATTERN + r"/user/([^/?#]+)" test = ( ("http://pornreactor.cc/user/Disillusion", { "range": "1-25", diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py index 7030c81..ae1749e 100644 --- a/gallery_dl/extractor/readcomiconline.py +++ b/gallery_dl/extractor/readcomiconline.py @@ -45,7 +45,7 @@ class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor): """Extractor for comic-issues from readcomiconline.to""" subcategory = "issue" pattern = (r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to" - r"(/Comic/[^/?&#]+/[^/?&#]+\?id=(\d+))") + r"(/Comic/[^/?#]+/[^/?#]+\?id=(\d+))") test = ("https://readcomiconline.to/Comic/W-i-t-c-h/Issue-130?id=22289", { "url": "2bbab6ec4fbc05d269cca420a82a9b5acda28682", "keyword": "30fe110273e871305001f33c18634516a0a51421", @@ -81,7 +81,7 @@ class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor): chapterclass = ReadcomiconlineIssueExtractor subcategory = "comic" pattern = (r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to" - r"(/Comic/[^/?&#]+/?)$") + r"(/Comic/[^/?#]+/?)$") test = ( ("https://readcomiconline.to/Comic/W-i-t-c-h", { "url": "e231bc2a293edb465133c37a8e36a7e7d94cab14", diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 9c6892a..0be7f17 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -47,6 +47,8 @@ class RedditExtractor(Extractor): urls = [] if submission: + submission["date"] = text.parse_timestamp( + submission["created_utc"]) yield Message.Directory, submission visited.add(submission["id"]) url = submission["url"] @@ -135,7 +137,7 @@ class RedditSubredditExtractor(RedditExtractor): """Extractor for URLs from subreddits on reddit.com""" subcategory = "subreddit" pattern = (r"(?:https?://)?(?:\w+\.)?reddit\.com/r/" - r"([^/?&#]+(?:/[a-z]+)?)/?(?:\?([^#]*))?(?:$|#)") + r"([^/?#]+(?:/[a-z]+)?)/?(?:\?([^#]*))?(?:$|#)") test = ( ("https://www.reddit.com/r/lavaporn/", { "range": "1-20", @@ -160,7 +162,7 @@ class RedditUserExtractor(RedditExtractor): """Extractor for URLs from posts by a reddit user""" subcategory = "user" pattern = (r"(?:https?://)?(?:\w+\.)?reddit\.com/u(?:ser)?/" - r"([^/?&#]+(?:/[a-z]+)?)/?(?:\?([^#]*))?") + r"([^/?#]+(?:/[a-z]+)?)/?(?:\?([^#]*))?") test = ( ("https://www.reddit.com/user/username/", { "count": ">= 2", @@ -183,7 +185,7 @@ class RedditSubmissionExtractor(RedditExtractor): """Extractor for URLs from a submission on reddit.com""" subcategory = "submission" pattern = (r"(?:https?://)?(?:" - r"(?:\w+\.)?reddit\.com/(?:r/[^/?&#]+/comments|gallery)" + r"(?:\w+\.)?reddit\.com/(?:r/[^/?#]+/comments|gallery)" r"|redd\.it)/([a-z0-9]+)") test = ( ("https://www.reddit.com/r/lavaporn/comments/8cqhub/", { @@ -229,7 +231,7 @@ class RedditImageExtractor(Extractor): subcategory = "image" archive_fmt = "{filename}" pattern = (r"(?:https?://)?i\.redd(?:\.it|ituploads\.com)" - r"/[^/?&#]+(?:\?[^#]*)?") + r"/[^/?#]+(?:\?[^#]*)?") test = ( ("https://i.redd.it/upjtjcx2npzz.jpg", { "url": "0de614900feef103e580b632190458c0b62b641a", diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py index 96be3d8..0a85be6 100644 --- a/gallery_dl/extractor/redgifs.py +++ b/gallery_dl/extractor/redgifs.py @@ -22,7 +22,7 @@ class RedgifsUserExtractor(RedgifsExtractor): """Extractor for redgifs user profiles""" subcategory = "user" directory_fmt = ("{category}", "{userName}") - pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/users/([^/?&#]+)" + pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/users/([^/?#]+)" test = ("https://www.redgifs.com/users/Natalifiction", { "pattern": r"https://\w+\.(redgifs|gfycat)\.com/[A-Za-z]+\.mp4", "count": ">= 100", @@ -36,7 +36,7 @@ class RedgifsSearchExtractor(RedgifsExtractor): """Extractor for redgifs search results""" subcategory = "search" directory_fmt = ("{category}", "Search", "{search}") - pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/gifs/browse/([^/?&#]+)" + pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/gifs/browse/([^/?#]+)" test = ("https://www.redgifs.com/gifs/browse/jav", { "pattern": r"https://\w+\.(redgifs|gfycat)\.com/[A-Za-z]+\.mp4", "range": "100-300", diff --git a/gallery_dl/extractor/sankakucomplex.py b/gallery_dl/extractor/sankakucomplex.py index 0189fc9..f6ad327 100644 --- a/gallery_dl/extractor/sankakucomplex.py +++ b/gallery_dl/extractor/sankakucomplex.py @@ -30,7 +30,7 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor): filename_fmt = "{filename}.{extension}" archive_fmt = "{date:%Y%m%d}_{filename}" pattern = (r"(?:https?://)?www\.sankakucomplex\.com" - r"/(\d{4}/\d\d/\d\d/[^/?&#]+)") + r"/(\d{4}/\d\d/\d\d/[^/?#]+)") test = ( ("https://www.sankakucomplex.com/2019/05/11/twitter-cosplayers", { "url": "4a9ecc5ae917fbce469280da5b6a482510cae84d", diff --git a/gallery_dl/extractor/seiga.py b/gallery_dl/extractor/seiga.py index 69962c8..b32a170 100644 --- a/gallery_dl/extractor/seiga.py +++ b/gallery_dl/extractor/seiga.py @@ -172,7 +172,7 @@ class SeigaImageExtractor(SeigaExtractor): r"|lohas\.nicoseiga\.jp/(?:thumb|(?:priv|o)/[^/]+/\d+)/)(\d+)") test = ( ("https://seiga.nicovideo.jp/seiga/im5977527", { - "keyword": "f66ba5de33d4ce2cb57f23bb37e1e847e0771c10", + "keyword": "c8339781da260f7fc44894ad9ada016f53e3b12a", "content": "d9202292012178374d57fb0126f6124387265297", }), ("https://seiga.nicovideo.jp/seiga/im123", { @@ -196,4 +196,23 @@ class SeigaImageExtractor(SeigaExtractor): return num def get_images(self): - return ({}, {"image_id": text.parse_int(self.image_id)}) + url = "{}/seiga/im{}".format(self.root, self.image_id) + page = self.request(url, notfound="image").text + + data = text.extract_all(page, ( + ("date" , '<li class="date"><span class="created">', '<'), + ("title" , '<h1 class="title">', '</h1>'), + ("description" , '<p class="discription">', '</p>'), + ))[0] + + data["user"] = text.extract_all(page, ( + ("id" , '<a href="/user/illust/' , '"'), + ("name", '<span itemprop="title">', '<'), + ))[0] + + data["description"] = text.remove_html(data["description"]) + data["image_id"] = text.parse_int(self.image_id) + data["date"] = text.parse_datetime( + data["date"] + ":00+0900", "%Y年%m月%d日 %H:%M:%S%z") + + return (data, data) diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py index 2cef430..41d2e67 100644 --- a/gallery_dl/extractor/sexcom.py +++ b/gallery_dl/extractor/sexcom.py @@ -168,7 +168,7 @@ class SexcomBoardExtractor(SexcomExtractor): subcategory = "board" directory_fmt = ("{category}", "{user}", "{board}") pattern = (r"(?:https?://)?(?:www\.)?sex\.com/user" - r"/([^/?&#]+)/(?!(?:following|pins|repins|likes)/)([^/?&#]+)") + r"/([^/?#]+)/(?!(?:following|pins|repins|likes)/)([^/?#]+)") test = ("https://www.sex.com/user/ronin17/exciting-hentai/", { "count": ">= 15", }) @@ -193,7 +193,7 @@ class SexcomSearchExtractor(SexcomExtractor): subcategory = "search" directory_fmt = ("{category}", "search", "{search[query]}") pattern = (r"(?:https?://)?(?:www\.)?sex\.com/((?:" - r"(pic|gif|video)s/([^/?&#]+)|search/(pic|gif|video)s" + r"(pic|gif|video)s/([^/?#]+)|search/(pic|gif|video)s" r")/?(?:\?([^#]+))?)") test = ( ("https://www.sex.com/search/pics?query=ecchi", { diff --git a/gallery_dl/extractor/simplyhentai.py b/gallery_dl/extractor/simplyhentai.py index a0d34d1..7301cbc 100644 --- a/gallery_dl/extractor/simplyhentai.py +++ b/gallery_dl/extractor/simplyhentai.py @@ -18,7 +18,7 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor): archive_fmt = "{image_id}" pattern = (r"(?:https?://)?(?!videos\.)([\w-]+\.simply-hentai\.com" r"(?!/(?:album|gifs?|images?|series)(?:/|$))" - r"(?:/(?!(?:page|all-pages)(?:/|\.|$))[^/?&#]+)+)") + r"(?:/(?!(?:page|all-pages)(?:/|\.|$))[^/?#]+)+)") test = ( (("https://original-work.simply-hentai.com" "/amazon-no-hiyaku-amazon-elixir"), { @@ -84,7 +84,7 @@ class SimplyhentaiImageExtractor(Extractor): filename_fmt = "{category}_{token}{title:?_//}.{extension}" archive_fmt = "{token}" pattern = (r"(?:https?://)?(?:www\.)?(simply-hentai\.com" - r"/(image|gif)/[^/?&#]+)") + r"/(image|gif)/[^/?#]+)") test = ( (("https://www.simply-hentai.com/image" "/pheromomania-vol-1-kanzenban-isao-3949d8b3-400c-4b6"), { @@ -138,7 +138,7 @@ class SimplyhentaiVideoExtractor(Extractor): directory_fmt = ("{category}", "{type}s") filename_fmt = "{title}{episode:?_//>02}.{extension}" archive_fmt = "{title}_{episode}" - pattern = r"(?:https?://)?(videos\.simply-hentai\.com/[^/?&#]+)" + pattern = r"(?:https?://)?(videos\.simply-hentai\.com/[^/?#]+)" test = ( ("https://videos.simply-hentai.com/creamy-pie-episode-02", { "pattern": r"https://www\.googleapis\.com/drive/v3/files" diff --git a/gallery_dl/extractor/slickpic.py b/gallery_dl/extractor/slickpic.py index 05ec117..ddd45ce 100644 --- a/gallery_dl/extractor/slickpic.py +++ b/gallery_dl/extractor/slickpic.py @@ -33,7 +33,7 @@ class SlickpicAlbumExtractor(SlickpicExtractor): "{album[id]} {album[title]}") filename_fmt = "{num:>03}_{id}{title:?_//}.{extension}" archive_fmt = "{id}" - pattern = BASE_PATTERN + r"/albums/([^/?&#]+)" + pattern = BASE_PATTERN + r"/albums/([^/?#]+)" test = ( ("https://mattcrandall.slickpic.com/albums/LamborghiniMurcielago/", { "url": "58bd94ebc80fd906e9879826970b408d54c6da07", diff --git a/gallery_dl/extractor/slideshare.py b/gallery_dl/extractor/slideshare.py index 30420a8..8f668df 100644 --- a/gallery_dl/extractor/slideshare.py +++ b/gallery_dl/extractor/slideshare.py @@ -20,7 +20,7 @@ class SlidesharePresentationExtractor(Extractor): filename_fmt = "{presentation}-{num:>02}.{extension}" archive_fmt = "{presentation}_{num}" pattern = (r"(?:https?://)?(?:www\.)?slideshare\.net" - r"/(?:mobile/)?([^/?&#]+)/([^/?&#]+)") + r"/(?:mobile/)?([^/?#]+)/([^/?#]+)") test = ( (("https://www.slideshare.net" "/Slideshare/get-started-with-slide-share"), { diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py index 163102d..cfbd5eb 100644 --- a/gallery_dl/extractor/smugmug.py +++ b/gallery_dl/extractor/smugmug.py @@ -108,7 +108,7 @@ class SmugmugImageExtractor(SmugmugExtractor): """Extractor for individual smugmug images""" subcategory = "image" archive_fmt = "{Image[ImageKey]}" - pattern = BASE_PATTERN + r"(?:/[^/?&#]+)+/i-([^/?&#-]+)" + pattern = BASE_PATTERN + r"(?:/[^/?#]+)+/i-([^/?#-]+)" test = ( ("https://tdm.smugmug.com/Nature/Dove/i-kCsLJT6", { "url": "f624ad7293afd6412a7d34e3950a118596c36c85", @@ -141,7 +141,7 @@ class SmugmugImageExtractor(SmugmugExtractor): class SmugmugPathExtractor(SmugmugExtractor): """Extractor for smugmug albums from URL paths and users""" subcategory = "path" - pattern = BASE_PATTERN + r"((?:/[^/?&#a-fh-mo-z][^/?&#]*)*)/?$" + pattern = BASE_PATTERN + r"((?:/[^/?#a-fh-mo-z][^/?#]*)*)/?$" test = ( ("https://tdm.smugmug.com/Nature/Dove", { "pattern": "smugmug:album:cr4C7f$", diff --git a/gallery_dl/extractor/speakerdeck.py b/gallery_dl/extractor/speakerdeck.py index a3819c7..f5b9171 100644 --- a/gallery_dl/extractor/speakerdeck.py +++ b/gallery_dl/extractor/speakerdeck.py @@ -20,7 +20,7 @@ class SpeakerdeckPresentationExtractor(Extractor): filename_fmt = "{presentation}-{num:>02}.{extension}" archive_fmt = "{presentation}_{num}" pattern = (r"(?:https?://)?(?:www\.)?speakerdeck\.com" - r"/([^/?&#]+)/([^/?&#]+)") + r"/([^/?#]+)/([^/?#]+)") test = ( (("https://speakerdeck.com/speakerdeck/introduction-to-speakerdeck"), { "pattern": r"https://files.speakerdeck.com/presentations/" diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py index 38b39d4..753f266 100644 --- a/gallery_dl/extractor/subscribestar.py +++ b/gallery_dl/extractor/subscribestar.py @@ -135,7 +135,7 @@ class SubscribestarExtractor(Extractor): class SubscribestarUserExtractor(SubscribestarExtractor): """Extractor for media from a subscribestar user""" subcategory = "user" - pattern = BASE_PATTERN + r"/(?!posts/)([^/?&#]+)" + pattern = BASE_PATTERN + r"/(?!posts/)([^/?#]+)" test = ( ("https://www.subscribestar.com/subscribestar", { "count": ">= 20", diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index 185f33a..cf57a4d 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -295,7 +295,7 @@ class TumblrPostExtractor(TumblrExtractor): class TumblrTagExtractor(TumblrExtractor): """Extractor for images from a tumblr-user by tag""" subcategory = "tag" - pattern = BASE_PATTERN + r"/tagged/([^/?&#]+)" + pattern = BASE_PATTERN + r"/tagged/([^/?#]+)" test = ("http://demo.tumblr.com/tagged/Times%20Square", { "pattern": (r"https://\d+\.media\.tumblr\.com/tumblr_[^/_]+_1280.jpg"), "count": 1, diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index c98a300..06973b2 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -9,10 +9,8 @@ """Extractors for https://twitter.com/""" from .common import Extractor, Message -from .. import text, exception +from .. import text, util, exception from ..cache import cache -import hashlib -import time BASE_PATTERN = ( @@ -29,7 +27,6 @@ class TwitterExtractor(Extractor): archive_fmt = "{tweet_id}_{retweet_id}_{num}" cookiedomain = ".twitter.com" root = "https://twitter.com" - sizes = (":orig", ":large", ":medium", ":small") def __init__(self, match): Extractor.__init__(self, match) @@ -39,6 +36,7 @@ class TwitterExtractor(Extractor): self.twitpic = self.config("twitpic", False) self.quoted = self.config("quoted", True) self.videos = self.config("videos", True) + self.cards = self.config("cards", False) self._user_cache = {} def items(self): @@ -58,56 +56,82 @@ class TwitterExtractor(Extractor): self.log.debug("Skipping %s (quoted tweet)", tweet["id_str"]) continue + files = [] + if "extended_entities" in tweet: + self._extract_media(tweet, files) + if "card" in tweet and self.cards: + self._extract_card(tweet, files) if self.twitpic: - self._extract_twitpic(tweet) - if "extended_entities" not in tweet: + self._extract_twitpic(tweet, files) + if not files: continue tdata = self._transform_tweet(tweet) tdata.update(metadata) - yield Message.Directory, tdata - for tdata["num"], media in enumerate( - tweet["extended_entities"]["media"], 1): - - tdata["width"] = media["original_info"].get("width", 0) - tdata["height"] = media["original_info"].get("height", 0) - - if "video_info" in media: - - if self.videos == "ytdl": - url = "ytdl:{}/i/web/status/{}".format( - self.root, tweet["id_str"]) - tdata["extension"] = None - yield Message.Url, url, tdata - - elif self.videos: - video_info = media["video_info"] - variant = max( - video_info["variants"], - key=lambda v: v.get("bitrate", 0), - ) - tdata["duration"] = video_info.get( - "duration_millis", 0) / 1000 - tdata["bitrate"] = variant.get("bitrate", 0) - - url = variant["url"] - text.nameext_from_url(url, tdata) - yield Message.Url, url, tdata - - elif "media_url_https" in media: - url = media["media_url_https"] - urls = [url + size for size in self.sizes] - text.nameext_from_url(url, tdata) - yield Message.Urllist, urls, tdata - - else: - url = media["media_url"] - text.nameext_from_url(url, tdata) - yield Message.Url, url, tdata + for tdata["num"], file in enumerate(files, 1): + file.update(tdata) + url = file.pop("url") + if "extension" not in file: + text.nameext_from_url(url, file) + yield Message.Url, url, file + + def _extract_media(self, tweet, files): + for media in tweet["extended_entities"]["media"]: + width = media["original_info"].get("width", 0), + height = media["original_info"].get("height", 0), + + if "video_info" in media: + if self.videos == "ytdl": + files.append({ + "url": "ytdl:{}/i/web/status/{}".format( + self.root, tweet["id_str"]), + "width" : width, + "height" : height, + "extension": None, + }) + elif self.videos: + video_info = media["video_info"] + variant = max( + video_info["variants"], + key=lambda v: v.get("bitrate", 0), + ) + files.append({ + "url" : variant["url"], + "width" : width, + "height" : height, + "bitrate" : variant.get("bitrate", 0), + "duration": video_info.get( + "duration_millis", 0) / 1000, + }) + elif "media_url_https" in media: + url = media["media_url_https"] + files.append(text.nameext_from_url(url, { + "url" : url + ":orig", + "_fallback": [url+":large", url+":medium", url+":small"], + "width" : width, + "height" : height, + })) + else: + files.append({"url": media["media_url"]}) + + def _extract_card(self, tweet, files): + card = tweet["card"] + if card["name"] in ("summary", "summary_large_image"): + bvals = card["binding_values"] + for prefix in ("photo_image_full_size_", + "summary_photo_image_", + "thumbnail_image_"): + for size in ("original", "x_large", "large", "small"): + key = prefix + size + if key in bvals: + files.append(bvals[key]["image_value"]) + return + else: + url = "ytdl:{}/i/web/status/{}".format(self.root, tweet["id_str"]) + files.append({"url": url}) - def _extract_twitpic(self, tweet): - twitpics = [] + def _extract_twitpic(self, tweet, files): for url in tweet["entities"].get("urls", ()): url = url["expanded_url"] if "//twitpic.com/" in url and "/photos/" not in url: @@ -117,15 +141,7 @@ class TwitterExtractor(Extractor): url = text.extract( response.text, 'name="twitter:image" value="', '"')[0] if url: - twitpics.append({ - "original_info": {}, - "media_url" : url, - }) - if twitpics: - if "extended_entities" in tweet: - tweet["extended_entities"]["media"].extend(twitpics) - else: - tweet["extended_entities"] = {"media": twitpics} + files.append({"url": url}) def _transform_tweet(self, tweet): entities = tweet["entities"] @@ -247,7 +263,7 @@ class TwitterTimelineExtractor(TwitterExtractor): """Extractor for all images from a user's timeline""" subcategory = "timeline" pattern = BASE_PATTERN + \ - r"/(?!search)(?:([^/?&#]+)/?(?:$|[?#])|intent/user\?user_id=(\d+))" + r"/(?!search)(?:([^/?#]+)/?(?:$|[?#])|intent/user\?user_id=(\d+))" test = ( ("https://twitter.com/supernaturepics", { "range": "1-40", @@ -271,7 +287,7 @@ class TwitterTimelineExtractor(TwitterExtractor): class TwitterMediaExtractor(TwitterExtractor): """Extractor for all images from a user's Media Tweets""" subcategory = "media" - pattern = BASE_PATTERN + r"/(?!search)([^/?&#]+)/media(?!\w)" + pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/media(?!\w)" test = ( ("https://twitter.com/supernaturepics/media", { "range": "1-40", @@ -288,7 +304,7 @@ class TwitterMediaExtractor(TwitterExtractor): class TwitterLikesExtractor(TwitterExtractor): """Extractor for liked tweets""" subcategory = "likes" - pattern = BASE_PATTERN + r"/(?!search)([^/?&#]+)/likes(?!\w)" + pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/likes(?!\w)" test = ("https://twitter.com/supernaturepics/likes",) def tweets(self): @@ -326,7 +342,7 @@ class TwitterSearchExtractor(TwitterExtractor): class TwitterTweetExtractor(TwitterExtractor): """Extractor for images from individual tweets""" subcategory = "tweet" - pattern = BASE_PATTERN + r"/([^/?&#]+|i/web)/status/(\d+)" + pattern = BASE_PATTERN + r"/([^/?#]+|i/web)/status/(\d+)" test = ( ("https://twitter.com/supernaturepics/status/604341487988576256", { "url": "0e801d2f98142dd87c3630ded9e4be4a4d63b580", @@ -375,11 +391,16 @@ class TwitterTweetExtractor(TwitterExtractor): "pattern": r"https://\w+.cloudfront.net/photos/large/\d+.jpg", "count": 3, }), - # Nitter tweet + # Nitter tweet (#890) ("https://nitter.net/ed1conf/status/1163841619336007680", { "url": "0f6a841e23948e4320af7ae41125e0c5b3cadc98", "content": "f29501e44d88437fe460f5c927b7543fda0f6e34", }), + # Twitter card (#1005) + ("https://twitter.com/billboard/status/1306599586602135555", { + "options": (("cards", True),), + "pattern": r"https://pbs.twimg.com/card_img/1317274761030856707/", + }), # original retweets (#1026) ("https://twitter.com/jessica_3978/status/1296304589591810048", { "options": (("retweets", "original"),), @@ -446,7 +467,7 @@ class TwitterAPI(): cookies = self.extractor.session.cookies # CSRF - csrf = hashlib.md5(str(time.time()).encode()).hexdigest() + csrf = util.generate_csrf_token() self.headers["x-csrf-token"] = csrf cookies.set("ct0", csrf, domain=".twitter.com") diff --git a/gallery_dl/extractor/vanillarock.py b/gallery_dl/extractor/vanillarock.py index 687ce3c..e10c642 100644 --- a/gallery_dl/extractor/vanillarock.py +++ b/gallery_dl/extractor/vanillarock.py @@ -29,7 +29,7 @@ class VanillarockPostExtractor(VanillarockExtractor): filename_fmt = "{num:>02}.{extension}" archive_fmt = "{filename}" pattern = (r"(?:https?://)?(?:www\.)?vanilla-rock\.com" - r"(/(?!category/|tag/)[^/?&#]+)/?$") + r"(/(?!category/|tag/)[^/?#]+)/?$") test = ("https://vanilla-rock.com/mizuhashi_parsee-5", { "url": "7fb9a4d18d9fa22d7295fee8d94ab5a7a52265dd", "keyword": "b91df99b714e1958d9636748b1c81a07c3ef52c9", @@ -66,7 +66,7 @@ class VanillarockTagExtractor(VanillarockExtractor): """Extractor for vanillarock blog posts by tag or category""" subcategory = "tag" pattern = (r"(?:https?://)?(?:www\.)?vanilla-rock\.com" - r"(/(?:tag|category)/[^?&#]+)") + r"(/(?:tag|category)/[^?#]+)") test = ( ("https://vanilla-rock.com/tag/%e5%b0%84%e5%91%bd%e4%b8%b8%e6%96%87", { "pattern": VanillarockPostExtractor.pattern, diff --git a/gallery_dl/extractor/weasyl.py b/gallery_dl/extractor/weasyl.py index a39fbf1..6799784 100644 --- a/gallery_dl/extractor/weasyl.py +++ b/gallery_dl/extractor/weasyl.py @@ -30,6 +30,10 @@ class WeasylExtractor(Extractor): return True return False + def __init__(self, match): + Extractor.__init__(self, match) + self.session.headers['X-Weasyl-API-Key'] = self.config("api-key") + def request_submission(self, submitid): return self.request( "{}/api/submissions/{}/view".format(self.root, submitid)).json() @@ -64,7 +68,7 @@ class WeasylExtractor(Extractor): class WeasylSubmissionExtractor(WeasylExtractor): subcategory = "submission" - pattern = BASE_PATTERN + r"(?:~[\w-]+/submissions|submission)/(\d+)" + pattern = BASE_PATTERN + r"(?:~[\w~-]+/submissions|submission)/(\d+)" test = ( ("https://www.weasyl.com/~fiz/submissions/2031/a-wesley", { "pattern": "https://cdn.weasyl.com/~fiz/submissions/2031/41ebc1c29" @@ -105,12 +109,13 @@ class WeasylSubmissionExtractor(WeasylExtractor): class WeasylSubmissionsExtractor(WeasylExtractor): subcategory = "submissions" - pattern = BASE_PATTERN + r"(?:~|submissions/)([\w-]+)/?$" + pattern = BASE_PATTERN + r"(?:~|submissions/)([\w~-]+)/?$" test = ( ("https://www.weasyl.com/~tanidareal", { "count": ">= 200" }), ("https://www.weasyl.com/submissions/tanidareal"), + ("https://www.weasyl.com/~aro~so") ) def __init__(self, match): @@ -126,7 +131,7 @@ class WeasylSubmissionsExtractor(WeasylExtractor): class WeasylFolderExtractor(WeasylExtractor): subcategory = "folder" directory_fmt = ("{category}", "{owner_login}", "{folder_name}") - pattern = BASE_PATTERN + r"submissions/([\w-]+)\?folderid=(\d+)" + pattern = BASE_PATTERN + r"submissions/([\w~-]+)\?folderid=(\d+)" test = ("https://www.weasyl.com/submissions/tanidareal?folderid=7403", { "count": ">= 12" }) @@ -175,7 +180,7 @@ class WeasylJournalsExtractor(WeasylExtractor): subcategory = "journals" filename_fmt = "{journalid} {title}.{extension}" archive_fmt = "{journalid}" - pattern = BASE_PATTERN + r"journals/([\w-]+)" + pattern = BASE_PATTERN + r"journals/([\w~-]+)" test = ("https://www.weasyl.com/journals/charmander", { "count": ">= 2", }) diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py index d42730e..55324cb 100644 --- a/gallery_dl/extractor/webtoons.py +++ b/gallery_dl/extractor/webtoons.py @@ -34,7 +34,7 @@ class WebtoonsEpisodeExtractor(WebtoonsExtractor): directory_fmt = ("{category}", "{comic}") filename_fmt = "{episode}-{num:>02}.{extension}" archive_fmt = "{title_no}_{episode}_{num}" - pattern = (BASE_PATTERN + r"/([^/?&#]+)/([^/?&#]+)/(?:[^/?&#]+))" + pattern = (BASE_PATTERN + r"/([^/?#]+)/([^/?#]+)/(?:[^/?#]+))" r"/viewer(?:\?([^#'\"]+))") test = ( (("https://www.webtoons.com/en/comedy/safely-endangered" @@ -97,7 +97,7 @@ class WebtoonsComicExtractor(WebtoonsExtractor): """Extractor for an entire comic on webtoons.com""" subcategory = "comic" categorytransfer = True - pattern = (BASE_PATTERN + r"/([^/?&#]+)/([^/?&#]+))" + pattern = (BASE_PATTERN + r"/([^/?#]+)/([^/?#]+))" r"/list(?:\?([^#]+))") test = ( # english diff --git a/gallery_dl/extractor/xhamster.py b/gallery_dl/extractor/xhamster.py index 5f11df3..258e89c 100644 --- a/gallery_dl/extractor/xhamster.py +++ b/gallery_dl/extractor/xhamster.py @@ -33,7 +33,7 @@ class XhamsterGalleryExtractor(XhamsterExtractor): "{gallery[id]} {gallery[title]}") filename_fmt = "{num:>03}_{id}.{extension}" archive_fmt = "{id}" - pattern = BASE_PATTERN + r"(/photos/gallery/[^/?&#]+)" + pattern = BASE_PATTERN + r"(/photos/gallery/[^/?#]+)" test = ( ("https://xhamster.com/photos/gallery/11748968", { "pattern": r"https://thumb-p\d+.xhcdn.com/./[\w/-]+_1000.jpg$", @@ -152,7 +152,7 @@ class XhamsterGalleryExtractor(XhamsterExtractor): class XhamsterUserExtractor(XhamsterExtractor): """Extractor for all galleries of an xhamster user""" subcategory = "user" - pattern = BASE_PATTERN + r"/users/([^/?&#]+)(?:/photos)?/?(?:$|[?#])" + pattern = BASE_PATTERN + r"/users/([^/?#]+)(?:/photos)?/?(?:$|[?#])" test = ( ("https://xhamster.com/users/goldenpalomino/photos", { "pattern": XhamsterGalleryExtractor.pattern, @@ -174,7 +174,7 @@ class XhamsterUserExtractor(XhamsterExtractor): while url: extr = text.extract_from(self.request(url).text) while True: - url = extr('thumb-image-container" href="', '"') + url = extr('thumb-image-container role-pop" href="', '"') if not url: break yield Message.Queue, url, data diff --git a/gallery_dl/extractor/xvideos.py b/gallery_dl/extractor/xvideos.py index b7d116a..9fdc5aa 100644 --- a/gallery_dl/extractor/xvideos.py +++ b/gallery_dl/extractor/xvideos.py @@ -28,7 +28,7 @@ class XvideosGalleryExtractor(XvideosBase, GalleryExtractor): archive_fmt = "{gallery[id]}_{num}" pattern = (r"(?:https?://)?(?:www\.)?xvideos\.com" r"/(?:profiles|amateur-channels|model-channels)" - r"/([^/?&#]+)/photos/(\d+)") + r"/([^/?#]+)/photos/(\d+)") test = ( ("https://www.xvideos.com/profiles/pervertedcouple/photos/751031", { "url": "cb4657a37eea5ab6b1d333491cee7eeb529b0645", @@ -94,7 +94,7 @@ class XvideosUserExtractor(XvideosBase, Extractor): subcategory = "user" categorytransfer = True pattern = (r"(?:https?://)?(?:www\.)?xvideos\.com" - r"/profiles/([^/?&#]+)/?(?:#.*)?$") + r"/profiles/([^/?#]+)/?(?:#.*)?$") test = ( ("https://www.xvideos.com/profiles/pervertedcouple", { "url": "a413f3e60d6d3a2de79bd44fa3b7a9c03db4336e", diff --git a/gallery_dl/extractor/yuki.py b/gallery_dl/extractor/yuki.py index 0844c40..72d7cad 100644 --- a/gallery_dl/extractor/yuki.py +++ b/gallery_dl/extractor/yuki.py @@ -19,7 +19,7 @@ class YukiThreadExtractor(Extractor): directory_fmt = ("{category}", "{board}", "{thread}{title:? - //}") filename_fmt = "{time}-{filename}.{extension}" archive_fmt = "{board}_{thread}_{tim}" - pattern = r"(?:https?://)?yuki\.la/([^/?&#]+)/(\d+)" + pattern = r"(?:https?://)?yuki\.la/([^/?#]+)/(\d+)" test = ( ("https://yuki.la/gd/309639", { "url": "289e86c5caf673a2515ec5f5f521ac0ae7e189e9", diff --git a/gallery_dl/job.py b/gallery_dl/job.py index b62240b..66dea08 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -110,12 +110,6 @@ class Job(): if self.pred_queue(url, kwds): self.handle_queue(url, kwds) - elif msg[0] == Message.Urllist: - _, urls, kwds = msg - if self.pred_url(urls[0], kwds): - self.update_kwdict(kwds) - self.handle_urllist(urls, kwds) - elif msg[0] == Message.Metadata: self.update_kwdict(msg[1]) self.handle_metadata(msg[1]) @@ -130,10 +124,6 @@ class Job(): def handle_url(self, url, kwdict): """Handle Message.Url""" - def handle_urllist(self, urls, kwdict): - """Handle Message.Urllist""" - self.handle_url(urls[0], kwdict) - def handle_directory(self, kwdict): """Handle Message.Directory""" @@ -215,7 +205,7 @@ class DownloadJob(Job): else: self.visited = set() - def handle_url(self, url, kwdict, fallback=None): + def handle_url(self, url, kwdict): """Download the resource specified in 'url'""" postprocessors = self.postprocessors pathfmt = self.pathfmt @@ -246,7 +236,7 @@ class DownloadJob(Job): if not self.download(url): # use fallback URLs if available - for num, url in enumerate(fallback or (), 1): + for num, url in enumerate(kwdict.get("_fallback", ()), 1): util.remove_file(pathfmt.temppath) self.log.info("Trying fallback URL #%d", num) if self.download(url): @@ -279,12 +269,6 @@ class DownloadJob(Job): pp.run_after(pathfmt) self._skipcnt = 0 - def handle_urllist(self, urls, kwdict): - """Download the resource specified in 'url'""" - fallback = iter(urls) - url = next(fallback) - self.handle_url(url, kwdict, fallback) - def handle_directory(self, kwdict): """Set and create the target directory for downloads""" if not self.pathfmt: @@ -563,15 +547,11 @@ class UrlJob(Job): self.handle_queue = self.handle_url @staticmethod - def handle_url(url, _): + def handle_url(url, kwdict): print(url) - - @staticmethod - def handle_urllist(urls, _): - prefix = "" - for url in urls: - print(prefix, url, sep="") - prefix = "| " + if "_fallback" in kwdict: + for url in kwdict["_fallback"]: + print("|", url) def handle_queue(self, url, _): try: @@ -625,9 +605,6 @@ class DataJob(Job): def handle_url(self, url, kwdict): self.data.append((Message.Url, url, self.filter(kwdict))) - def handle_urllist(self, urls, kwdict): - self.data.append((Message.Urllist, list(urls), self.filter(kwdict))) - def handle_directory(self, kwdict): self.data.append((Message.Directory, self.filter(kwdict))) diff --git a/gallery_dl/text.py b/gallery_dl/text.py index 9a716f9..8b06384 100644 --- a/gallery_dl/text.py +++ b/gallery_dl/text.py @@ -70,7 +70,7 @@ def ensure_http_scheme(url, scheme="https://"): def filename_from_url(url): """Extract the last part of an URL to use as a filename""" try: - return urllib.parse.urlsplit(url).path.rpartition("/")[2] + return url.partition("?")[0].rpartition("/")[2] except (TypeError, AttributeError): return "" diff --git a/gallery_dl/util.py b/gallery_dl/util.py index 3e91405..d85d2b3 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -13,6 +13,7 @@ import os import sys import json import time +import random import shutil import string import _string @@ -60,6 +61,10 @@ def raises(cls): return wrap +def generate_csrf_token(): + return random.getrandbits(128).to_bytes(16, "big").hex() + + def combine_dict(a, b): """Recursively combine the contents of 'b' into 'a'""" for key, value in b.items(): @@ -490,6 +495,7 @@ class Formatter(): - "u": calls str.upper - "c": calls str.capitalize - "C": calls string.capwords + - "t": calls str.strip - "U": calls urllib.parse.unquote - "S": calls util.to_string() - Example: {f!l} -> "example"; {f!u} -> "EXAMPLE" @@ -520,6 +526,7 @@ class Formatter(): "u": str.upper, "c": str.capitalize, "C": string.capwords, + "t": str.strip, "U": urllib.parse.unquote, "S": to_string, "s": str, diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 81976c2..b2e5a58 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.15.1" +__version__ = "1.15.2" diff --git a/test/test_results.py b/test/test_results.py index a594032..d54017e 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -25,13 +25,14 @@ TRAVIS_SKIP = { "exhentai", "mangafox", "dynastyscans", "nijie", "instagram", "ngomik", "archivedmoe", "archiveofsins", "thebarchive", "fireden", "4plebs", "sankaku", "idolcomplex", "mangahere", "mangadex", "sankakucomplex", - "warosu", "fuskator", "patreon", "komikcast", + "warosu", "fuskator", "patreon", "komikcast", "twitter", } # temporary issues, etc. BROKEN = { + "imgbox", "imagevenue", - "ngomik", + "mangapanda", "photobucket", } @@ -296,6 +297,7 @@ class TestFormatter(util.Formatter): def setup_test_config(): name = "gallerydl" email = "gallerydl@openaliasbox.org" + email2 = "gallerydl@protonmail.com" config.clear() config.set(("cache",), "file", None) @@ -307,6 +309,7 @@ def setup_test_config(): config.set(("extractor", "nijie") , "username", email) config.set(("extractor", "seiga") , "username", email) + config.set(("extractor", "pinterest") , "username", email2) config.set(("extractor", "newgrounds"), "username", "d1618111") config.set(("extractor", "newgrounds"), "password", "d1618111") diff --git a/test/test_util.py b/test/test_util.py index 1515814..08ecd64 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -265,6 +265,7 @@ class TestFormatter(unittest.TestCase): "d": {"a": "foo", "b": 0, "c": None}, "l": ["a", "b", "c"], "n": None, + "s": " \n\r\tSPACE ", "u": "%27%3C%20/%20%3E%27", "name": "Name", "title1": "Title", @@ -278,6 +279,7 @@ class TestFormatter(unittest.TestCase): self._run_test("{a!u}", "HELLO WORLD") self._run_test("{a!c}", "Hello world") self._run_test("{a!C}", "Hello World") + self._run_test("{s!t}", "SPACE") self._run_test("{a!U}", self.kwdict["a"]) self._run_test("{u!U}", "'< / >'") self._run_test("{a!s}", self.kwdict["a"]) |