diff options
author | Unit 193 <unit193@ubuntu.com> | 2020-03-16 23:20:22 -0400 |
---|---|---|
committer | Unit 193 <unit193@ubuntu.com> | 2020-03-16 23:20:22 -0400 |
commit | f1baa4aa12d705e290f74c9fb4c6cd5eb2976fa2 (patch) | |
tree | 70267e5f04db1da396e75fd4148d9c542683bbab | |
parent | 2bd320e568d015940227b7355396701331e2cd1e (diff) | |
parent | e8cc000750de972384f2f34d02d42222b4018ae9 (diff) | |
download | gallery-dl-f1baa4aa12d705e290f74c9fb4c6cd5eb2976fa2.tar.bz2 gallery-dl-f1baa4aa12d705e290f74c9fb4c6cd5eb2976fa2.tar.xz gallery-dl-f1baa4aa12d705e290f74c9fb4c6cd5eb2976fa2.tar.zst |
Update upstream source from tag 'upstream/1.13.2'
Update to upstream version '1.13.2'
with Debian dir a36309ac1ae7b23d042eaafd21c4267c2f840ab4
67 files changed, 2825 insertions, 934 deletions
@@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery_dl -Version: 1.12.3 +Version: 1.13.2 Summary: Command-line program to download image-galleries and -collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Author: Mike Fährmann @@ -94,8 +94,8 @@ Description: ========== put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__, and run it inside a command prompt (like ``cmd.exe``). - - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.12.3/gallery-dl.exe>`__ - - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.12.3/gallery-dl.bin>`__ + - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.13.2/gallery-dl.exe>`__ + - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.13.2/gallery-dl.bin>`__ These executables include a Python 3.8 interpreter and all required Python packages. @@ -141,6 +141,13 @@ Description: ========== $ gallery-dl -g -u "<username>" -p "<password>" "https://seiga.nicovideo.jp/seiga/im3211703" + Filter manga chapters by language and chapter number: + + .. code:: bash + + $ gallery-dl --chapter-filter "lang == 'fr' and 10 <= chapter < 20" "https://mangadex.org/title/2354/" + + | Search a remote resource for URLs and download images from them: | (URLs for which no extractor can be found will be silently ignored) @@ -186,8 +193,8 @@ Description: ========== Some extractors require you to provide valid login-credentials in the form of a username & password pair. This is necessary for ``pixiv``, ``nijie``, and ``seiga`` - and optional (but strongly recommended) for - ``danbooru``, ``exhentai``, ``idolcomplex``, ``instagram``, + and optional for + ``danbooru``, ``e621``, ``exhentai``, ``idolcomplex``, ``instagram``, ``luscious``, ``sankaku``, ``tsumino``, and ``twitter``. You can set the necessary information in your configuration file @@ -240,7 +247,7 @@ Description: ========== .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst - .. _stable: https://github.com/mikf/gallery-dl/archive/v1.12.3.tar.gz + .. _stable: https://github.com/mikf/gallery-dl/archive/v1.13.2.tar.gz .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz .. _Python: https://www.python.org/downloads/ @@ -83,8 +83,8 @@ Download a standalone executable file, put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__, and run it inside a command prompt (like ``cmd.exe``). -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.12.3/gallery-dl.exe>`__ -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.12.3/gallery-dl.bin>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.13.2/gallery-dl.exe>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.13.2/gallery-dl.bin>`__ These executables include a Python 3.8 interpreter and all required Python packages. @@ -130,6 +130,13 @@ Get the direct URL of an image from a site that requires authentication: $ gallery-dl -g -u "<username>" -p "<password>" "https://seiga.nicovideo.jp/seiga/im3211703" +Filter manga chapters by language and chapter number: + +.. code:: bash + + $ gallery-dl --chapter-filter "lang == 'fr' and 10 <= chapter < 20" "https://mangadex.org/title/2354/" + + | Search a remote resource for URLs and download images from them: | (URLs for which no extractor can be found will be silently ignored) @@ -175,8 +182,8 @@ Username & Password Some extractors require you to provide valid login-credentials in the form of a username & password pair. This is necessary for ``pixiv``, ``nijie``, and ``seiga`` -and optional (but strongly recommended) for -``danbooru``, ``exhentai``, ``idolcomplex``, ``instagram``, +and optional for +``danbooru``, ``e621``, ``exhentai``, ``idolcomplex``, ``instagram``, ``luscious``, ``sankaku``, ``tsumino``, and ``twitter``. You can set the necessary information in your configuration file @@ -229,7 +236,7 @@ access to *gallery-dl*. Authorize it and you will be shown one or more .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst -.. _stable: https://github.com/mikf/gallery-dl/archive/v1.12.3.tar.gz +.. _stable: https://github.com/mikf/gallery-dl/archive/v1.13.2.tar.gz .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz .. _Python: https://www.python.org/downloads/ diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1 index a2cd77d..304c345 100644 --- a/data/man/gallery-dl.1 +++ b/data/man/gallery-dl.1 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL" "1" "2020-01-19" "1.12.3" "gallery-dl Manual" +.TH "GALLERY-DL" "1" "2020-03-14" "1.13.2" "gallery-dl Manual" .\" disable hyphenation .nh diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5 index d7bb941..4ad93f8 100644 --- a/data/man/gallery-dl.conf.5 +++ b/data/man/gallery-dl.conf.5 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL.CONF" "5" "2020-01-19" "1.12.3" "gallery-dl Manual" +.TH "GALLERY-DL.CONF" "5" "2020-03-14" "1.13.2" "gallery-dl Manual" .\" disable hyphenation .nh .\" disable justification (adjust text to left margin only) @@ -139,6 +139,18 @@ segment, which will be joined together and appended to the .IP "Description:" 4 Directory path used as the base for all download destinations. +.SS extractor.*.parent-directory +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Use an extractor's current target directory as +\f[I]base-directory <extractor.*.base-directory_>\f[] +for any spawned child extractors. + .SS extractor.*.path-restrict .IP "Type:" 6 \f[I]string\f[] @@ -155,12 +167,12 @@ in generated path segment names. Special values: - +.br * \f[I]"auto"\f[]: Use characters from \f[I]"unix"\f[] or \f[I]"windows"\f[] depending on the local operating system - +.br * \f[I]"unix"\f[]: \f[I]"/"\f[] - +.br * \f[I]"windows"\f[]: \f[I]"\\\\\\\\|/<>:\\"?*"\f[] Note: In a set with 2 or more characters, \f[I][]^-\\\f[] need to be @@ -191,24 +203,24 @@ Controls the behavior when downloading files that have been downloaded before, i.e. a file with the same filename already exists or its ID is in a \f[I]download archive\f[]. - +.br * \f[I]true\f[]: Skip downloads - +.br * \f[I]false\f[]: Overwrite already existing files - +.br * \f[I]"abort"\f[]: Abort the current extractor run - +.br * \f[I]"abort:N"\f[]: Skip downloads and abort extractor run after \f[I]N\f[] consecutive skips - +.br * \f[I]"exit"\f[]: Exit the program altogether - +.br * \f[I]"exit:N"\f[]: Skip downloads and exit the program after \f[I]N\f[] consecutive skips - +.br * \f[I]"enumerate"\f[]: Add an enumeration index to the beginning of the filename extension (\f[I]file.1.ext\f[], \f[I]file.2.ext\f[], etc.) @@ -233,18 +245,43 @@ Number of seconds to sleep before each download. The username and password to use when attempting to log in to another site. -Specifying username and password is required for the -\f[I]pixiv\f[], \f[I]nijie\f[], and \f[I]seiga\f[] -modules and optional (but strongly recommended) for -\f[I]danbooru\f[], \f[I]exhentai\f[], \f[I]idolcomplex\f[], \f[I]instagram\f[], -\f[I]luscious\f[], \f[I]sankaku\f[], \f[I]tsumino\f[], and \f[I]twitter\f[]. +Specifying a username and password is required for + +.br +* \f[I]pixiv\f[] +.br +* \f[I]nijie\f[] +.br +* \f[I]seiga\f[] + +and optional for + +.br +* \f[I]danbooru\f[] +.br +* \f[I]e621\f[] +.br +* \f[I]exhentai\f[] +.br +* \f[I]idolcomplex\f[] +.br +* \f[I]instagram\f[] +.br +* \f[I]luscious\f[] +.br +* \f[I]sankaku\f[] +.br +* \f[I]tsumino\f[] +.br +* \f[I]twitter\f[] These values can also be set via the \f[I]-u/--username\f[] and \f[I]-p/--password\f[] command-line options or by using a \f[I].netrc\f[] file. (see Authentication_) -Note: The password for \f[I]danbooru\f[] is the API key found in your -user profile, not the password for your account. +Note: The password values for \f[I]danbooru\f[] and \f[I]e621\f[] should be +the API keys found in your user profile, not your actual account +password. .SS extractor.*.netrc .IP "Type:" 6 @@ -264,14 +301,23 @@ Enable the use of \f[I].netrc\f[] authentication data. \f[I]null\f[] .IP "Description:" 4 -Source to read additional cookies from. +Source to read additional cookies from. Either as +.br +* the \f[I]Path\f[] to a Mozilla/Netscape format cookies.txt file or +.br +* a JSON \f[I]object\f[] specifying cookies as a name-to-value mapping -* If this is a \f[I]Path\f[], it specifies a -Mozilla/Netscape format cookies.txt file. +Example: + +.. code:: + +{ +"cookie-name": "cookie-value", +"sessionid" : "14313336321%3AsabDFvuASDnlpb%3A31", +"isAdult" : "1" +} -* If this is an \f[I]object\f[], its key-value pairs, which should both -be \f[I]strings\f[], will be used as cookie-names and -values. .SS extractor.*.cookies-update .IP "Type:" 6 @@ -281,8 +327,9 @@ be \f[I]strings\f[], will be used as cookie-names and -values. \f[I]true\f[] .IP "Description:" 4 -If \f[I]extractor.*.cookies\f[] specifies a cookies.txt file, update its -contents with cookies received during data extraction. +If \f[I]extractor.*.cookies\f[] specifies the \f[I]Path\f[] to a cookies.txt +file and it can be opened and parsed without errors, +update its contents with cookies received during data extraction. .SS extractor.*.proxy .IP "Type:" 6 @@ -294,10 +341,10 @@ contents with cookies received during data extraction. .IP "Description:" 4 Proxy (or proxies) to be used for remote connections. - +.br * If this is a \f[I]string\f[], it is the proxy URL for all outgoing requests. - +.br * If this is an \f[I]object\f[], it is a scheme-to-proxy mapping to specify different proxy URLs for each scheme. It is also possible to set a proxy for a specific host by using @@ -482,8 +529,8 @@ Note: The index of the first image is \f[I]1\f[]. \f[I]string\f[] .IP "Description:" 4 -Like \f[I]image-range\f[], but applies to delegated URLs -like manga-chapters, etc. +Like \f[I]image-range <extractor.*.image-range_>\f[], +but applies to delegated URLs like manga-chapters, etc. .SS extractor.*.image-filter .IP "Type:" 6 @@ -509,9 +556,15 @@ by \f[I]-K\f[] or \f[I]-j\f[]. .IP "Type:" 6 \f[I]string\f[] +.IP "Example:" 4 +"lang == 'en'" +.br +"language == 'French' and 10 <= chapter < 20" +.br + .IP "Description:" 4 -Like \f[I]image-filter\f[], but applies to delegated URLs -like manga-chapters, etc. +Like \f[I]image-filter <extractor.*.image-filter_>\f[], +but applies to delegated URLs like manga-chapters, etc. .SS extractor.*.image-unique .IP "Type:" 6 @@ -559,6 +612,16 @@ See \f[I]strptime\f[] for a list of formatting directives. .IP "Description:" 4 Try to follow external URLs of embedded players. +.SS extractor.blogger.videos +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]true\f[] + +.IP "Description:" 4 +Download embedded videos hosted on https://www.blogger.com/ + .SS extractor.danbooru.ugoira .IP "Type:" 6 \f[I]bool\f[] @@ -569,9 +632,9 @@ Try to follow external URLs of embedded players. .IP "Description:" 4 Controls the download target for Ugoira posts. - +.br * \f[I]true\f[]: Original ZIP archives - +.br * \f[I]false\f[]: Converted video files .SS extractor.deviantart.extra @@ -597,9 +660,9 @@ Note: Enabling this option also enables deviantart.metadata_. Select the directory structure created by the Gallery- and Favorite-Extractors. - +.br * \f[I]true\f[]: Use a flat directory structure. - +.br * \f[I]false\f[]: Collect a list of all gallery-folders or favorites-collections and transfer any further work to other extractors (\f[I]folder\f[] or \f[I]collection\f[]), which will then @@ -648,11 +711,11 @@ You can use \f[I]"all"\f[] instead of listing all values separately. .IP "Description:" 4 Selects the output format of journal entries. - +.br * \f[I]"html"\f[]: HTML with (roughly) the same layout as on DeviantArt. - +.br * \f[I]"text"\f[]: Plain text with image references and HTML tags removed. - +.br * \f[I]"none"\f[]: Don't download journals. .SS extractor.deviantart.mature @@ -720,9 +783,9 @@ The \f[I]refresh-token\f[] value you get from Using a \f[I]refresh-token\f[] allows you to access private or otherwise not publicly available deviations. -Note: Authenticating with a \f[I]refresh-token\f[] requires persistent -storage in a \f[I]cache file <cache.file_>\f[]. -Otherwise the token will become invalid after its first use. +Note: The \f[I]refresh-token\f[] becomes invalid +\f[I]after 3 months <https://www.deviantart.com/developers/authentication#refresh>\f[] +or whenever your \f[I]cache file <cache.file_>\f[] is deleted or cleared. .SS extractor.deviantart.wait-min .IP "Type:" 6 @@ -804,14 +867,33 @@ Extract and download videos. .IP "Description:" 4 Sets the maximum allowed size for downloaded images. - +.br * If this is an \f[I]integer\f[], it specifies the maximum image dimension (width and height) in pixels. - +.br * If this is a \f[I]string\f[], it should be one of Flickr's format specifiers (\f[I]"Original"\f[], \f[I]"Large"\f[], ... or \f[I]"o"\f[], \f[I]"k"\f[], \f[I]"h"\f[], \f[I]"l"\f[], ...) to use as an upper limit. +.SS extractor.furaffinity.include +.IP "Type:" 6 +\f[I]string\f[] or \f[I]list\f[] of \f[I]strings\f[] + +.IP "Default:" 9 +\f[I]"gallery"\f[] + +.IP "Example:" 4 +"scraps,favorite" or ["scraps", "favorite"] + +.IP "Description:" 4 +A (comma-separated) list of subcategories to include +when processing a user profile. + +Possible values are +\f[I]"gallery"\f[], \f[I]"scraps"\f[], \f[I]"favorite"\f[]. + +You can use \f[I]"all"\f[] instead of listing all values separately. + .SS extractor.gelbooru.api .IP "Type:" 6 \f[I]bool\f[] @@ -840,6 +922,18 @@ If the selected format is not available, \f[I]"mp4"\f[], \f[I]"webm"\f[] and \f[I]"gif"\f[] (in that order) will be tried instead, until an available format is found. +.SS extractor.hitomi.metadata +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]true\f[] + +.IP "Description:" 4 +Try to extract +\f[I]artist\f[], \f[I]group\f[], \f[I]parody\f[], and \f[I]characters\f[] +metadata. + .SS extractor.imgur.mp4 .IP "Type:" 6 \f[I]bool\f[] or \f[I]string\f[] @@ -850,12 +944,12 @@ available format is found. .IP "Description:" 4 Controls whether to choose the GIF or MP4 version of an animation. - +.br * \f[I]true\f[]: Follow Imgur's advice and choose MP4 if the \f[I]prefer_video\f[] flag in an image's metadata is set. - +.br * \f[I]false\f[]: Always choose GIF. - +.br * \f[I]"always"\f[]: Always choose MP4. .SS extractor.instagram.highlights @@ -889,9 +983,9 @@ Download video files. .IP "Description:" 4 Controls how to handle redirects to CAPTCHA pages. - +.br * \f[I]"stop\f[]: Stop the current extractor run. - +.br * \f[I]"wait\f[]: Ask the user to solve the CAPTCHA and wait. .SS extractor.newgrounds.include @@ -923,12 +1017,27 @@ You can use \f[I]"all"\f[] instead of listing all values separately. .IP "Description:" 4 Controls how a user is directed to an OAuth authorization site. - +.br * \f[I]true\f[]: Use Python's \f[I]webbrowser.open()\f[] method to automatically open the URL in the user's browser. - +.br * \f[I]false\f[]: Ask the user to copy & paste an URL from the terminal. +.SS extractor.oauth.port +.IP "Type:" 6 +\f[I]integer\f[] + +.IP "Default:" 9 +\f[I]6414\f[] + +.IP "Description:" 4 +Port number to listen on during OAuth authorization. + +Note: All redirects will go to http://localhost:6414/, regardless +of the port specified here. You'll have to manually adjust the +port number in your browser's address bar when using a different +port than the default. + .SS extractor.photobucket.subalbums .IP "Type:" 6 \f[I]bool\f[] @@ -939,6 +1048,16 @@ open the URL in the user's browser. .IP "Description:" 4 Download subalbums. +.SS extractor.pixiv.user.avatar +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Download user avatars. + .SS extractor.pixiv.ugoira .IP "Type:" 6 \f[I]bool\f[] @@ -949,9 +1068,12 @@ Download subalbums. .IP "Description:" 4 Download Pixiv's Ugoira animations or ignore them. -These animations come as a \f[I].zip\f[] file containing all the single +These animations come as a \f[I].zip\f[] file containing all animation frames in JPEG format. +Use an \f[I]ugoira\f[] post processor to convert them +to watchable videos. (Example__) + .SS extractor.plurk.comments .IP "Type:" 6 \f[I]bool\f[] @@ -983,9 +1105,9 @@ during the extraction process. .IP "Description:" 4 Controls how to handle redirects to CAPTCHA pages. - +.br * \f[I]"stop\f[]: Stop the current extractor run. - +.br * \f[I]"wait\f[]: Ask the user to solve the CAPTCHA and wait. .SS extractor.recursive.blacklist @@ -1066,9 +1188,9 @@ This value sets the maximum recursion depth. Special values: - +.br * \f[I]0\f[]: Recursion is disabled - +.br * \f[I]-1\f[]: Infinite recursion (don't do this) .SS extractor.reddit.refresh-token @@ -1088,6 +1210,25 @@ authorized to do so, but requests to the reddit API are going to be rate limited at 600 requests every 10 minutes/600 seconds. +.SS extractor.reddit.videos +.IP "Type:" 6 +\f[I]bool\f[] or \f[I]string\f[] + +.IP "Default:" 9 +\f[I]true\f[] + +.IP "Description:" 4 +Control video download behavior. + +.br +* \f[I]true\f[]: Download videos and use \f[I]youtube-dl\f[] to handle +HLS and DASH manifests +.br +* \f[I]"ytdl"\f[]: Download videos and let \f[I]youtube-dl\f[] handle all of +video extraction and download +.br +* \f[I]false\f[]: Ignore videos + .SS extractor.sankaku.wait-min & .wait-max .IP "Type:" 6 \f[I]float\f[] @@ -1161,11 +1302,11 @@ Search posts for inline images and videos. \f[I]true\f[] .IP "Description:" 4 - +.br * \f[I]true\f[]: Extract media from reblogged posts - +.br * \f[I]false\f[]: Skip reblogged posts - +.br * \f[I]"same-blog"\f[]: Skip reblogged posts unless the original post is from the same blog @@ -1222,18 +1363,18 @@ Extract \f[I]TwitPic <https://twitpic.com/>\f[] embeds. \f[I]bool\f[] or \f[I]string\f[] .IP "Default:" 9 -\f[I]false\f[] +\f[I]true\f[] .IP "Description:" 4 Control video download behavior. - +.br * \f[I]true\f[]: Download videos and use \f[I]youtube-dl\f[] to handle HLS \f[I].m3u8\f[] manifests - +.br * \f[I]"ytdl"\f[]: Download videos and let \f[I]youtube-dl\f[] handle all of video extraction and download - +.br * \f[I]false\f[]: Skip video Tweets .SS extractor.vsco.videos @@ -1282,9 +1423,9 @@ Note: This requires 1 additional HTTP request for each post. .IP "Description:" 4 Reverse the order of chapter URLs extracted from manga pages. - +.br * \f[I]true\f[]: Start with the latest chapter - +.br * \f[I]false\f[]: Start with the first chapter .SH DOWNLOADER OPTIONS @@ -1319,11 +1460,11 @@ to set file modification times. .IP "Description:" 4 Controls the use of \f[I].part\f[] files during file downloads. - +.br * \f[I]true\f[]: Write downloaded data into \f[I].part\f[] files and rename them upon download completion. This mode additionally supports resuming incomplete downloads. - +.br * \f[I]false\f[]: Do not use \f[I].part\f[] files and write data directly into the actual output files. @@ -1451,9 +1592,9 @@ used to generate filenames for files downloaded with youtube-dl. Special values: - +.br * \f[I]null\f[]: generate filenames with \f[I]extractor.*.filename\f[] - +.br * \f[I]"default"\f[]: use youtube-dl's default, currently \f[I]"%(title)s-%(id)s.%(ext)s"\f[] Note: An output template other than \f[I]null\f[] might @@ -1492,15 +1633,15 @@ All available options can be found in \f[I]youtube-dl's docstrings .IP "Description:" 4 Controls the output string format and status indicators. - +.br * \f[I]"null"\f[]: No output - +.br * \f[I]"pipe"\f[]: Suitable for piping to other processes or files - +.br * \f[I]"terminal"\f[]: Suitable for the standard Windows console - +.br * \f[I]"color"\f[]: Suitable for terminals that understand ANSI escape codes and colors - +.br * \f[I]"auto"\f[]: Automatically choose the best suitable output mode .SS output.shorten @@ -1525,12 +1666,12 @@ on one console line. Controls the progress indicator when *gallery-dl* is run with multiple URLs as arguments. - +.br * \f[I]true\f[]: Show the default progress indicator (\f[I]"[{current}/{total}] {url}"\f[]) - +.br * \f[I]false\f[]: Do not show any progress indicator - +.br * Any \f[I]string\f[]: Show the progress indicator using this as a custom \f[I]format string\f[]. Possible replacement keys are \f[I]current\f[], \f[I]total\f[] and \f[I]url\f[]. @@ -1614,9 +1755,9 @@ in their default location. .IP "Description:" 4 The action to take when files do not compare as equal. - +.br * \f[I]"replace"\f[]: Replace/Overwrite the old version with the new one - +.br * \f[I]"enumerate"\f[]: Add an enumeration index to the filename of the new version like \f[I]skip = "enumerate" <extractor.*.skip_>\f[] @@ -1654,13 +1795,13 @@ or to let it run asynchronously. .IP "Description:" 4 The command to run. - +.br * If this is a \f[I]string\f[], it will be executed using the system's shell, e.g. \f[I]/bin/sh\f[]. Any \f[I]{}\f[] will be replaced with the full path of a file or target directory, depending on \f[I]exec.final\f[] - +.br * If this is a \f[I]list\f[], the first element specifies the program name and any further elements its arguments. Each element of this list is treated as a \f[I]format string\f[] using @@ -1689,12 +1830,12 @@ have been downloaded successfully. .IP "Description:" 4 Select how to write metadata. - +.br * \f[I]"json"\f[]: all metadata using \f[I]json.dump() <https://docs.python.org/3/library/json.html#json.dump>\f[] - +.br * \f[I]"tags"\f[]: \f[I]tags\f[] separated by newlines - +.br * \f[I]"custom"\f[]: result of applying \f[I]metadata.content-format\f[] to a file's metadata dictionary @@ -1827,12 +1968,12 @@ Enable Two-Pass encoding. .IP "Description:" 4 Controls the frame rate argument (\f[I]-r\f[]) for FFmpeg - +.br * \f[I]"auto"\f[]: Automatically assign a fitting frame rate based on delays between frames. - +.br * any other \f[I]string\f[]: Use this value as argument for \f[I]-r\f[]. - +.br * \f[I]null\f[] or an empty \f[I]string\f[]: Don't set an explicit frame rate. .SS ugoira.keep-files @@ -1904,11 +2045,11 @@ Keep the actual files after writing them to a ZIP archive. \f[I]"default"\f[] .IP "Description:" 4 - +.br * \f[I]"default"\f[]: Write the central directory file header once after everything is done or an exception is raised. - +.br * \f[I]"safe"\f[]: Update the central directory file header each time a file is stored in a ZIP archive. @@ -1922,9 +2063,9 @@ case the Python interpreter gets shut down unexpectedly \f[I]Path\f[] .IP "Default:" 9 - +.br * \f[I]tempfile.gettempdir()\f[] + \f[I]".gallery-dl.cache"\f[] on Windows - +.br * (\f[I]$XDG_CACHE_HOME\f[] or \f[I]"~/.cache"\f[]) + \f[I]"/gallery-dl/cache.sqlite3"\f[] on all other platforms .IP "Description:" 4 @@ -1942,11 +2083,11 @@ this cache. \f[I]true\f[] .IP "Description:" 4 - +.br * \f[I]true\f[]: Update urllib3's default cipher list - +.br * \f[I]false\f[]: Leave the default cipher list as is - +.br * Any \f[I]string\f[]: Replace urllib3's default ciphers with these (See \f[I]SSLContext.set_ciphers() <https://docs.python.org/3/library/ssl.html#ssl.SSLContext.set_ciphers>\f[] for details) @@ -1968,25 +2109,25 @@ SSL-support. \f[I]string\f[] .IP "How To:" 4 - +.br * login and visit DeviantArt's \f[I]Applications & Keys <https://www.deviantart.com/developers/apps>\f[] section - +.br * click "Register Application" - +.br * scroll to "OAuth2 Redirect URI Whitelist (Required)" and enter "https://mikf.github.io/gallery-dl/oauth-redirect.html" - +.br * scroll to the bottom and agree to the API License Agreement. Submission Policy, and Terms of Service. - +.br * click "Save" - +.br * copy \f[I]client_id\f[] and \f[I]client_secret\f[] of your new application and put them in your configuration file as \f[I]"client-id"\f[] and \f[I]"client-secret"\f[] - +.br * get a new \f[I]refresh-token <extractor.deviantart.refresh-token_>\f[] if necessary @@ -1995,15 +2136,15 @@ if necessary \f[I]string\f[] .IP "How To:" 4 - +.br * login and \f[I]Create an App <https://www.flickr.com/services/apps/create/apply/>\f[] in Flickr's \f[I]App Garden <https://www.flickr.com/services/>\f[] - +.br * click "APPLY FOR A NON-COMMERCIAL KEY" - +.br * fill out the form with a random name and description and click "SUBMIT" - +.br * copy \f[I]Key\f[] and \f[I]Secret\f[] and put them in your configuration file @@ -2020,19 +2161,19 @@ file \f[I]string\f[] .IP "How To:" 4 - +.br * login and visit the \f[I]apps <https://www.reddit.com/prefs/apps/>\f[] section of your account's preferences - +.br * click the "are you a developer? create an app..." button - +.br * fill out the form, choose "installed app", preferably set "http://localhost:6414/" as "redirect uri" and finally click "create app" - +.br * copy the client id (third line, under your application's name and "installed app") and put it in your configuration file - +.br * use "\f[I]Python:<application name>:v1.0 (by /u/<username>)\f[]" as user-agent and replace \f[I]<application name>\f[] and \f[I]<username>\f[] accordingly (see Reddit's @@ -2043,15 +2184,15 @@ accordingly (see Reddit's \f[I]string\f[] .IP "How To:" 4 - +.br * login and \f[I]Apply for an API Key <https://api.smugmug.com/api/developer/apply>\f[] - +.br * use a random name and description, set "Type" to "Application", "Platform" to "All", and "Use" to "Non-Commercial" - +.br * fill out the two checkboxes at the bottom and click "Apply" - +.br * copy \f[I]API Key\f[] and \f[I]API Secret\f[] and put them in your configuration file @@ -2060,20 +2201,20 @@ and put them in your configuration file \f[I]string\f[] .IP "How To:" 4 - +.br * login and visit Tumblr's \f[I]Applications <https://www.tumblr.com/oauth/apps>\f[] section - +.br * click "Register application" - +.br * fill out the form: use a random name and description, set https://example.org/ as "Application Website" and "Default callback URL" - +.br * solve Google's "I'm not a robot" challenge and click "Register" - +.br * click "Show secret key" (below "OAuth Consumer Key") - +.br * copy your \f[I]OAuth Consumer Key\f[] and \f[I]Secret Key\f[] and put them in your configuration file @@ -2093,9 +2234,9 @@ and put them in your configuration file .IP "Description:" 4 A \f[I]Date\f[] value represents a specific point in time. - +.br * If given as \f[I]string\f[], it is parsed according to \f[I]date-format\f[]. - +.br * If given as \f[I]integer\f[], it is interpreted as UTC timestamp. .SS Path @@ -2156,9 +2297,9 @@ The path \f[I]C:\\path\\to\\file.ext\f[] has therefore to be written as .IP "Description:" 4 Extended logging output configuration. - +.br * format - +.br * General format string for logging messages or a dictionary with format strings for each loglevel. @@ -2169,39 +2310,39 @@ it is also possible to access the current and \f[I]job <https://github.com/mikf/gallery-dl/blob/2e516a1e3e09cb8a9e36a8f6f7e41ce8d4402f5a/gallery_dl/job.py#L19>\f[] objects as well as their attributes (e.g. \f[I]"{extractor.url}"\f[]) - +.br * Default: \f[I]"[{name}][{levelname}] {message}"\f[] - +.br * format-date - +.br * Format string for \f[I]{asctime}\f[] fields in logging messages (see \f[I]strftime() directives <https://docs.python.org/3/library/time.html#time.strftime>\f[]) - +.br * Default: \f[I]"%Y-%m-%d %H:%M:%S"\f[] - +.br * level - +.br * Minimum logging message level (one of \f[I]"debug"\f[], \f[I]"info"\f[], \f[I]"warning"\f[], \f[I]"error"\f[], \f[I]"exception"\f[]) - +.br * Default: \f[I]"info"\f[] - +.br * path - +.br * \f[I]Path\f[] to the output file - +.br * mode - +.br * Mode in which the file is opened; use \f[I]"w"\f[] to truncate or \f[I]"a"\f[] to append (see \f[I]open() <https://docs.python.org/3/library/functions.html#open>\f[]) - +.br * Default: \f[I]"w"\f[] - +.br * encoding - +.br * File encoding - +.br * Default: \f[I]"utf-8"\f[] Note: path, mode and encoding are only applied when configuring diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO index 8f4897f..92ded16 100644 --- a/gallery_dl.egg-info/PKG-INFO +++ b/gallery_dl.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery-dl -Version: 1.12.3 +Version: 1.13.2 Summary: Command-line program to download image-galleries and -collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Author: Mike Fährmann @@ -94,8 +94,8 @@ Description: ========== put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__, and run it inside a command prompt (like ``cmd.exe``). - - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.12.3/gallery-dl.exe>`__ - - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.12.3/gallery-dl.bin>`__ + - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.13.2/gallery-dl.exe>`__ + - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.13.2/gallery-dl.bin>`__ These executables include a Python 3.8 interpreter and all required Python packages. @@ -141,6 +141,13 @@ Description: ========== $ gallery-dl -g -u "<username>" -p "<password>" "https://seiga.nicovideo.jp/seiga/im3211703" + Filter manga chapters by language and chapter number: + + .. code:: bash + + $ gallery-dl --chapter-filter "lang == 'fr' and 10 <= chapter < 20" "https://mangadex.org/title/2354/" + + | Search a remote resource for URLs and download images from them: | (URLs for which no extractor can be found will be silently ignored) @@ -186,8 +193,8 @@ Description: ========== Some extractors require you to provide valid login-credentials in the form of a username & password pair. This is necessary for ``pixiv``, ``nijie``, and ``seiga`` - and optional (but strongly recommended) for - ``danbooru``, ``exhentai``, ``idolcomplex``, ``instagram``, + and optional for + ``danbooru``, ``e621``, ``exhentai``, ``idolcomplex``, ``instagram``, ``luscious``, ``sankaku``, ``tsumino``, and ``twitter``. You can set the necessary information in your configuration file @@ -240,7 +247,7 @@ Description: ========== .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst - .. _stable: https://github.com/mikf/gallery-dl/archive/v1.12.3.tar.gz + .. _stable: https://github.com/mikf/gallery-dl/archive/v1.13.2.tar.gz .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz .. _Python: https://www.python.org/downloads/ diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt index bbe9bbe..ecb052c 100644 --- a/gallery_dl.egg-info/SOURCES.txt +++ b/gallery_dl.egg-info/SOURCES.txt @@ -34,10 +34,12 @@ gallery_dl/extractor/35photo.py gallery_dl/extractor/3dbooru.py gallery_dl/extractor/4chan.py gallery_dl/extractor/500px.py +gallery_dl/extractor/8kun.py gallery_dl/extractor/8muses.py gallery_dl/extractor/__init__.py gallery_dl/extractor/adultempire.py gallery_dl/extractor/artstation.py +gallery_dl/extractor/bcy.py gallery_dl/extractor/behance.py gallery_dl/extractor/blogger.py gallery_dl/extractor/bobx.py @@ -53,6 +55,7 @@ gallery_dl/extractor/fallenangels.py gallery_dl/extractor/flickr.py gallery_dl/extractor/foolfuuka.py gallery_dl/extractor/foolslide.py +gallery_dl/extractor/furaffinity.py gallery_dl/extractor/fuskator.py gallery_dl/extractor/gelbooru.py gallery_dl/extractor/gfycat.py @@ -61,8 +64,10 @@ gallery_dl/extractor/hentai2read.py gallery_dl/extractor/hentaicafe.py gallery_dl/extractor/hentaifoundry.py gallery_dl/extractor/hentaifox.py +gallery_dl/extractor/hentaihand.py gallery_dl/extractor/hentaihere.py gallery_dl/extractor/hentainexus.py +gallery_dl/extractor/hiperdex.py gallery_dl/extractor/hitomi.py gallery_dl/extractor/hypnohub.py gallery_dl/extractor/idolcomplex.py @@ -75,6 +80,7 @@ gallery_dl/extractor/imgth.py gallery_dl/extractor/imgur.py gallery_dl/extractor/instagram.py gallery_dl/extractor/issuu.py +gallery_dl/extractor/kabeuchi.py gallery_dl/extractor/keenspot.py gallery_dl/extractor/khinsider.py gallery_dl/extractor/kissmanga.py @@ -142,7 +148,6 @@ gallery_dl/extractor/wikiart.py gallery_dl/extractor/xhamster.py gallery_dl/extractor/xvideos.py gallery_dl/extractor/yandere.py -gallery_dl/extractor/yaplog.py gallery_dl/extractor/yuki.py gallery_dl/postprocessor/__init__.py gallery_dl/postprocessor/classify.py @@ -153,6 +158,7 @@ gallery_dl/postprocessor/metadata.py gallery_dl/postprocessor/mtime.py gallery_dl/postprocessor/ugoira.py gallery_dl/postprocessor/zip.py +test/test_cache.py test/test_config.py test/test_cookies.py test/test_downloader.py diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index ffaed3d..6fba5e2 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2019 Mike Fährmann +# Copyright 2014-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -129,17 +129,8 @@ def main(): for opts in args.options: config.set(*opts) - # stream logging handler - output.configure_logging_handler( - "log", logging.getLogger().handlers[0]) - - # file logging handler - handler = output.setup_logging_handler( - "logfile", lvl=args.loglevel) - if handler: - logging.getLogger().addHandler(handler) - # loglevels + output.configure_logging(args.loglevel) if args.loglevel >= logging.ERROR: config.set(("output",), "mode", "null") elif args.loglevel <= logging.DEBUG: diff --git a/gallery_dl/cache.py b/gallery_dl/cache.py index c48b53f..6cde65d 100644 --- a/gallery_dl/cache.py +++ b/gallery_dl/cache.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2019 Mike Fährmann +# Copyright 2016-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -96,12 +96,12 @@ class DatabaseCacheDecorator(): # database lookup fullkey = "%s-%s" % (self.key, key) - cursor = self.cursor() - try: - cursor.execute("BEGIN EXCLUSIVE") - except sqlite3.OperationalError: - pass # Silently swallow exception - workaround for Python 3.6 - try: + with self.database() as db: + cursor = db.cursor() + try: + cursor.execute("BEGIN EXCLUSIVE") + except sqlite3.OperationalError: + pass # Silently swallow exception - workaround for Python 3.6 cursor.execute( "SELECT value, expires FROM data WHERE key=? LIMIT 1", (fullkey,), @@ -118,37 +118,38 @@ class DatabaseCacheDecorator(): "INSERT OR REPLACE INTO data VALUES (?,?,?)", (fullkey, pickle.dumps(value), expires), ) - finally: - self.db.commit() + self.cache[key] = value, expires return value def update(self, key, value): expires = int(time.time()) + self.maxage self.cache[key] = value, expires - self.cursor().execute( - "INSERT OR REPLACE INTO data VALUES (?,?,?)", - ("%s-%s" % (self.key, key), pickle.dumps(value), expires), - ) + with self.database() as db: + db.execute( + "INSERT OR REPLACE INTO data VALUES (?,?,?)", + ("%s-%s" % (self.key, key), pickle.dumps(value), expires), + ) def invalidate(self, key): try: del self.cache[key] except KeyError: pass - self.cursor().execute( - "DELETE FROM data WHERE key=? LIMIT 1", - ("%s-%s" % (self.key, key),), - ) + with self.database() as db: + db.execute( + "DELETE FROM data WHERE key=?", + ("%s-%s" % (self.key, key),), + ) - def cursor(self): + def database(self): if self._init: self.db.execute( "CREATE TABLE IF NOT EXISTS data " "(key TEXT PRIMARY KEY, value TEXT, expires INTEGER)" ) DatabaseCacheDecorator._init = False - return self.db.cursor() + return self.db def memcache(maxage=None, keyarg=None): diff --git a/gallery_dl/cloudflare.py b/gallery_dl/cloudflare.py index 6e23c83..6ba5480 100644 --- a/gallery_dl/cloudflare.py +++ b/gallery_dl/cloudflare.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2019 Mike Fährmann +# Copyright 2015-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -38,7 +38,7 @@ def solve_challenge(session, response, kwargs): params = cf_kwargs["data"] = collections.OrderedDict() page = response.text - url = root + text.extract(page, 'action="', '"')[0] + url = root + text.unescape(text.extract(page, 'action="', '"')[0]) params["r"] = text.extract(page, 'name="r" value="', '"')[0] params["jschl_vc"] = text.extract(page, 'name="jschl_vc" value="', '"')[0] params["pass"] = text.extract(page, 'name="pass" value="', '"')[0] diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index 9cd2aa6..844e422 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -193,6 +193,9 @@ class HttpDownloader(DownloaderBase): mtype = response.headers.get("Content-Type", "image/jpeg") mtype = mtype.partition(";")[0] + if "/" not in mtype: + mtype = "image/" + mtype + if mtype in MIMETYPE_MAP: return MIMETYPE_MAP[mtype] @@ -231,6 +234,8 @@ MIMETYPE_MAP = { "image/png": "png", "image/gif": "gif", "image/bmp": "bmp", + "image/x-bmp": "bmp", + "image/x-ms-bmp": "bmp", "image/webp": "webp", "image/svg+xml": "svg", @@ -247,6 +252,7 @@ MIMETYPE_MAP = { "application/zip": "zip", "application/x-zip": "zip", "application/x-zip-compressed": "zip", + "application/rar": "rar", "application/x-rar": "rar", "application/x-rar-compressed": "rar", "application/x-7z-compressed": "7z", diff --git a/gallery_dl/extractor/3dbooru.py b/gallery_dl/extractor/3dbooru.py index ac96211..3773ee5 100644 --- a/gallery_dl/extractor/3dbooru.py +++ b/gallery_dl/extractor/3dbooru.py @@ -67,7 +67,7 @@ class _3dbooruPopularExtractor(booru.MoebooruPopularMixin, _3dbooruExtractor): r"/post/popular_(?P<scale>by_(?:day|week|month)|recent)" r"(?:\?(?P<query>[^#]*))?") test = ("http://behoimi.org/post/popular_by_month?month=2&year=2013", { - "url": "8b1a5c5b7a10f8f5d3d6124d1aabfee0277078cb", + "pattern": r"http://behoimi\.org/data/../../[0-9a-f]{32}\.jpg", "count": 20, }) diff --git a/gallery_dl/extractor/8kun.py b/gallery_dl/extractor/8kun.py new file mode 100644 index 0000000..7162920 --- /dev/null +++ b/gallery_dl/extractor/8kun.py @@ -0,0 +1,91 @@ +# -*- coding: utf-8 -*- + +# Copyright 2020 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://8kun.top/""" + +from .common import Extractor, Message +from .. import text + + +class _8kunThreadExtractor(Extractor): + """Extractor for 8kun threads""" + category = "8kun" + subcategory = "thread" + directory_fmt = ("{category}", "{board}", "{thread} {title}") + filename_fmt = "{time}{num:?-//} {filename}.{extension}" + archive_fmt = "{board}_{thread}_{tim}" + pattern = r"(?:https?://)?8kun\.top/([^/]+)/res/(\d+)" + test = ("https://8kun.top/test/res/65248.html", { + "pattern": r"https://media\.8kun\.top/file_store/\w{64}\.\w+", + "count": ">= 8", + }) + + def __init__(self, match): + Extractor.__init__(self, match) + self.board, self.thread = match.groups() + + def items(self): + url = "https://8kun.top/{}/res/{}.json".format(self.board, self.thread) + posts = self.request(url).json()["posts"] + title = posts[0].get("sub") or text.remove_html(posts[0]["com"]) + process = self._process + + data = { + "board" : self.board, + "thread": self.thread, + "title" : text.unescape(title)[:50], + "num" : 0, + } + + yield Message.Version, 1 + yield Message.Directory, data + for post in posts: + if "filename" in post: + yield process(post, data) + if "extra_files" in post: + for post["num"], filedata in enumerate( + post["extra_files"], 1): + yield process(post, filedata) + + @staticmethod + def _process(post, data): + post.update(data) + post["extension"] = post["ext"][1:] + url = "https://media.8kun.top/file_store/" + post["tim"] + post["ext"] + return Message.Url, url, post + + +class _8kunBoardExtractor(Extractor): + """Extractor for 8kun boards""" + category = "8kun" + subcategory = "board" + pattern = r"(?:https?://)?8kun\.top/([^/?&#]+)/(?:index|\d+)\.html" + test = ( + ("https://8kun.top/v/index.html", { + "pattern": _8kunThreadExtractor.pattern, + "count": ">= 100", + }), + ("https://8kun.top/v/2.html"), + ("https://8kun.top/v/index.html?PageSpeed=noscript"), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.board = match.group(1) + + def items(self): + url = "https://8kun.top/{}/threads.json".format(self.board) + threads = self.request(url).json() + + for page in threads: + for thread in page["threads"]: + url = "https://8kun.top/{}/res/{}.html".format( + self.board, thread["no"]) + thread["page"] = page["page"] + thread["_extractor"] = _8kunThreadExtractor + yield Message.Queue, url, thread diff --git a/gallery_dl/extractor/8muses.py b/gallery_dl/extractor/8muses.py index 089a0e9..dec5972 100644 --- a/gallery_dl/extractor/8muses.py +++ b/gallery_dl/extractor/8muses.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019 Mike Fährmann +# Copyright 2019-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -40,7 +40,7 @@ class _8musesAlbumExtractor(Extractor): "parent" : 10454, "views" : int, "likes" : int, - "date" : "type:datetime", + "date" : "dt:2018-07-10 00:00:00", }, }, }), diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 66203fe..74c553d 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -15,9 +15,11 @@ modules = [ "3dbooru", "4chan", "500px", + "8kun", "8muses", "adultempire", "artstation", + "bcy", "behance", "blogger", "bobx", @@ -28,6 +30,7 @@ modules = [ "exhentai", "fallenangels", "flickr", + "furaffinity", "fuskator", "gelbooru", "gfycat", @@ -36,8 +39,10 @@ modules = [ "hentaicafe", "hentaifoundry", "hentaifox", + "hentaihand", "hentaihere", "hentainexus", + "hiperdex", "hitomi", "hypnohub", "idolcomplex", @@ -49,6 +54,7 @@ modules = [ "imgur", "instagram", "issuu", + "kabeuchi", "keenspot", "khinsider", "kissmanga", @@ -110,7 +116,6 @@ modules = [ "xhamster", "xvideos", "yandere", - "yaplog", "yuki", "foolfuuka", "foolslide", diff --git a/gallery_dl/extractor/bcy.py b/gallery_dl/extractor/bcy.py new file mode 100644 index 0000000..c3049a4 --- /dev/null +++ b/gallery_dl/extractor/bcy.py @@ -0,0 +1,188 @@ +# -*- coding: utf-8 -*- + +# Copyright 2020 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://bcy.net/""" + +from .common import Extractor, Message +from .. import text +import json +import re + + +class BcyExtractor(Extractor): + """Base class for bcy extractors""" + category = "bcy" + directory_fmt = ("{category}", "{user[id]} {user[name]}") + filename_fmt = "{post[id]} {id}.{extension}" + archive_fmt = "{post[id]}_{id}" + root = "https://bcy.net" + + def __init__(self, match): + Extractor.__init__(self, match) + self.item_id = match.group(1) + + def items(self): + sub = re.compile(r"^https?://p\d+-bcy\.byteimg\.com/img/banciyuan").sub + iroot = "https://img-bcy-qn.pstatp.com" + noop = self.config("noop") + + for post in self.posts(): + if not post["image_list"]: + continue + + multi = None + tags = post.get("post_tags") or () + data = { + "user": { + "id" : post["uid"], + "name" : post["uname"], + "avatar" : sub(iroot, post["avatar"].partition("~")[0]), + }, + "post": { + "id" : text.parse_int(post["item_id"]), + "tags" : [t["tag_name"] for t in tags], + "date" : text.parse_timestamp(post["ctime"]), + "parody" : post["work"], + "content": post["plain"], + "likes" : post["like_count"], + "shares" : post["share_count"], + "replies": post["reply_count"], + }, + } + + yield Message.Directory, data + for data["num"], image in enumerate(post["image_list"], 1): + data["id"] = image["mid"] + data["width"] = image["w"] + data["height"] = image["h"] + + url = image["path"].partition("~")[0] + text.nameext_from_url(url, data) + + if data["extension"]: + if not url.startswith(iroot): + url = sub(iroot, url) + data["filter"] = "" + yield Message.Url, url, data + + else: + if not multi: + if len(post["multi"]) < len(post["image_list"]): + multi = self._data_from_post(post["item_id"]) + multi = multi["post_data"]["multi"] + else: + multi = post["multi"] + image = multi[data["num"] - 1] + + if image["origin"]: + data["filter"] = "watermark" + yield Message.Url, image["origin"], data + + if noop: + data["extension"] = "" + data["filter"] = "noop" + yield Message.Url, image["original_path"], data + + def posts(self): + """Returns an iterable with all relevant 'post' objects""" + + def _data_from_post(self, post_id): + url = "{}/item/detail/{}".format(self.root, post_id) + page = self.request(url).text + return json.loads( + text.extract(page, 'JSON.parse("', '");')[0] + .replace('\\\\u002F', '/') + .replace('\\"', '"') + )["detail"] + + +class BcyUserExtractor(BcyExtractor): + """Extractor for user timelines""" + subcategory = "user" + pattern = r"(?:https?://)?bcy\.net/u/(\d+)" + test = ( + ("https://bcy.net/u/1933712", { + "pattern": r"https://img-bcy-qn.pstatp.com/\w+/\d+/post/\w+/.+jpg", + "count": ">= 25", + }), + ("https://bcy.net/u/109282764041", { + "pattern": r"https://p\d-bcy.byteimg.com/img/banciyuan/[0-9a-f]+" + r"~tplv-banciyuan-logo-v3:.+\.image", + "range": "1-25", + "count": 25, + }), + ) + + def posts(self): + url = self.root + "/apiv3/user/selfPosts" + params = {"uid": self.item_id, "since": None} + + while True: + data = self.request(url, params=params).json() + + item = None + for item in data["data"]["items"]: + yield item["item_detail"] + + if not item: + return + params["since"] = item["since"] + + +class BcyPostExtractor(BcyExtractor): + """Extractor for individual posts""" + subcategory = "post" + pattern = r"(?:https?://)?bcy\.net/item/detail/(\d+)" + test = ( + ("https://bcy.net/item/detail/6355835481002893070", { + "url": "301202375e61fd6e0e2e35de6c3ac9f74885dec3", + "count": 1, + "keyword": { + "user": { + "id" : 1933712, + "name" : "wukloo", + "avatar" : "re:https://img-bcy-qn.pstatp.com/Public/", + }, + "post": { + "id" : 6355835481002893070, + "tags" : list, + "date" : "dt:2016-11-22 08:47:46", + "parody" : "东方PROJECT", + "content": "re:根据微博的建议稍微做了点修改", + "likes" : int, + "shares" : int, + "replies": int, + }, + "id": 8330182, + "num": 1, + "width" : 3000, + "height": 1687, + "filename": "712e0780b09011e696f973c3d1568337", + "extension": "jpg", + }, + }), + # only watermarked images available + ("https://bcy.net/item/detail/6780546160802143236", { + "pattern": r"https://p\d-bcy.byteimg.com/img/banciyuan/[0-9a-f]+" + r"~tplv-banciyuan-logo-v3:.+\.image", + "count": 8, + "keyword": {"filter": "watermark"} + }), + # only visible to logged in users + ("https://bcy.net/item/detail/6747523535150783495", { + "count": 0, + }), + ) + + def posts(self): + data = self._data_from_post(self.item_id) + post = data["post_data"] + post["image_list"] = post["multi"] + post["plain"] = text.parse_unicode_escapes(post["plain"]) + post.update(data["detail_user"]) + return (post,) diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py index 31bbaf8..2657b5d 100644 --- a/gallery_dl/extractor/blogger.py +++ b/gallery_dl/extractor/blogger.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019 Mike Fährmann +# Copyright 2019-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -10,6 +10,7 @@ from .common import Extractor, Message from .. import text +import json import re BASE_PATTERN = ( @@ -28,6 +29,7 @@ class BloggerExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) + self.videos = self.config("videos", True) self.blog = match.group(1) or match.group(2) self.api = BloggerAPI(self) @@ -41,24 +43,41 @@ class BloggerExtractor(Extractor): del blog["selfLink"] sub = re.compile(r"/s\d+/").sub - findall = re.compile( - r'src="(https?://\d+\.bp\.blogspot\.com/[^"]+)"').findall + findall_image = re.compile( + r'src="(https?://\d+\.bp\.blogspot\.com/[^"]+)').findall + findall_video = re.compile( + r'src="(https?://www\.blogger\.com/video\.g\?token=[^"]+)').findall for post in self.posts(blog): - images = findall(post["content"]) - if not images: + content = post["content"] + + files = findall_image(content) + for idx, url in enumerate(files): + files[idx] = sub("/s0/", url).replace("http:", "https:", 1) + + if self.videos and 'id="BLOG_video-' in content: + page = self.request(post["url"]).text + for url in findall_video(page): + page = self.request(url).text + video_config = json.loads(text.extract( + page, 'var VIDEO_CONFIG =', '\n')[0]) + files.append(max( + video_config["streams"], + key=lambda x: x["format_id"], + )["play_url"]) + + if not files: continue post["author"] = post["author"]["displayName"] post["replies"] = post["replies"]["totalItems"] - post["content"] = text.remove_html(post["content"]) + post["content"] = text.remove_html(content) post["date"] = text.parse_datetime(post["published"]) del post["selfLink"] del post["blog"] yield Message.Directory, {"blog": blog, "post": post} - for num, url in enumerate(images, 1): - url = sub("/s0/", url).replace("http:", "https:", 1) + for num, url in enumerate(files, 1): yield Message.Url, url, text.nameext_from_url(url, { "blog": blog, "post": post, @@ -80,7 +99,7 @@ class BloggerPostExtractor(BloggerExtractor): "pattern": r"https://3.bp.blogspot.com/.*/s0/Icy-Moonrise-.*.jpg", "keyword": { "blog": { - "date" : "type:datetime", + "date" : "dt:2010-11-21 18:19:42", "description": "", "id" : "5623928067739466034", "kind" : "blogger#blog", @@ -95,7 +114,7 @@ class BloggerPostExtractor(BloggerExtractor): "post": { "author" : "Julian Bunker", "content" : str, - "date" : "type:datetime", + "date" : "dt:2010-12-26 01:08:00", "etag" : str, "id" : "6955139236418998998", "kind" : "blogger#post", @@ -112,6 +131,11 @@ class BloggerPostExtractor(BloggerExtractor): ("blogger:http://www.julianbunker.com/2010/12/moon-rise.html", { "url": "9928429fb62f712eb4de80f53625eccecc614aae", }), + # video (#587) + (("http://cfnmscenesinmovies.blogspot.com/2011/11/" + "cfnm-scene-jenna-fischer-in-office.html"), { + "pattern": r"https://.+\.googlevideo\.com/videoplayback", + }), ) def __init__(self, match): @@ -171,8 +195,8 @@ class BloggerAPI(): def _pagination(self, endpoint, params): while True: data = self._call(endpoint, params) - yield from data["items"] - + if "items" in data: + yield from data["items"] if "nextPageToken" not in data: return params["pageToken"] = data["nextPageToken"] diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py index ac45e0b..162e9cc 100644 --- a/gallery_dl/extractor/booru.py +++ b/gallery_dl/extractor/booru.py @@ -27,7 +27,6 @@ class BooruExtractor(SharedConfigMixin, Extractor): page_start = 1 page_limit = None sort = False - ugoira = True def __init__(self, match): super().__init__(match) @@ -52,11 +51,7 @@ class BooruExtractor(SharedConfigMixin, Extractor): for image in images: try: - if "pixiv_ugoira_frame_data" in image and \ - "large_file_url" in image and not self.ugoira: - url = image["large_file_url"] - else: - url = image["file_url"] + url = image["file_url"] except KeyError: continue if url.startswith("/"): @@ -112,12 +107,6 @@ class XmlParserMixin(): return [post.attrib for post in root] -class DanbooruPageMixin(): - """Pagination for Danbooru v2""" - def update_page(self, data): - self.params["page"] = "b{}".format(data["id"]) - - class MoebooruPageMixin(): """Pagination for Moebooru and Danbooru v1""" def update_page(self, data): @@ -214,8 +203,8 @@ class PostMixin(): self.params["tags"] = "id:" + self.post -class PopularMixin(): - """Extraction and metadata handling for Danbooru v2""" +class MoebooruPopularMixin(): + """Extraction and metadata handling for Moebooru and Danbooru v1""" subcategory = "popular" directory_fmt = ("{category}", "popular", "{scale}", "{date}") archive_fmt = "P_{scale[0]}_{date}_{id}" @@ -225,37 +214,20 @@ class PopularMixin(): def __init__(self, match): super().__init__(match) self.params.update(text.parse_query(match.group("query"))) + self.scale = match.group("scale") def get_metadata(self, fmt="%Y-%m-%d"): - date = self.get_date() or datetime.datetime.utcnow().strftime(fmt) + date = self.get_date() or datetime.date.today().isoformat() scale = self.get_scale() or "day" if scale == "week": - dt = datetime.datetime.strptime(date, fmt) - dt -= datetime.timedelta(days=dt.weekday()) - date = dt.strftime(fmt) + date = datetime.date.fromisoformat(date) + date = (date - datetime.timedelta(days=date.weekday())).isoformat() elif scale == "month": date = date[:-3] return {"date": date, "scale": scale} - def get_scale(self): - if "scale" in self.params: - return self.params["scale"] - return None - - def get_date(self): - if "date" in self.params: - return self.params["date"][:10] - return None - - -class MoebooruPopularMixin(PopularMixin): - """Extraction and metadata handling for Moebooru and Danbooru v1""" - def __init__(self, match): - super().__init__(match) - self.scale = match.group("scale") - def get_date(self): if "year" in self.params: return "{:>04}-{:>02}-{:>02}".format( diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 55b15d4..19ee182 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -16,7 +16,6 @@ import logging import datetime import requests import threading -import http.cookiejar from .message import Message from .. import config, text, util, exception, cloudflare @@ -40,6 +39,7 @@ class Extractor(): self._cookiefile = None self._cookiejar = self.session.cookies + self._parentdir = "" self._retries = self.config("retries", 4) self._timeout = self.config("timeout", 30) self._verify = self.config("verify", True) @@ -197,13 +197,13 @@ class Extractor(): self._update_cookies_dict(cookies, self.cookiedomain) elif isinstance(cookies, str): cookiefile = util.expand_path(cookies) - cookiejar = http.cookiejar.MozillaCookieJar() try: - cookiejar.load(cookiefile) - except OSError as exc: + with open(cookiefile) as fp: + cookies = util.load_cookiestxt(fp) + except Exception as exc: self.log.warning("cookies: %s", exc) else: - self._cookiejar.update(cookiejar) + self._update_cookies(cookies) self._cookiefile = cookiefile else: self.log.warning( @@ -218,11 +218,9 @@ class Extractor(): def _store_cookies(self): """Store the session's cookiejar in a cookies.txt file""" if self._cookiefile and self.config("cookies-update", True): - cookiejar = http.cookiejar.MozillaCookieJar() - for cookie in self._cookiejar: - cookiejar.set_cookie(cookie) try: - cookiejar.save(self._cookiefile) + with open(self._cookiefile, "w") as fp: + util.save_cookiestxt(fp, self._cookiejar) except OSError as exc: self.log.warning("cookies: %s", exc) @@ -248,15 +246,22 @@ class Extractor(): def _check_cookies(self, cookienames, *, domain=None): """Check if all 'cookienames' are in the session's cookiejar""" + if not self._cookiejar: + return False + if domain is None: domain = self.cookiedomain - names = set(cookienames) + now = time.time() + for cookie in self._cookiejar: - if cookie.domain == domain: - names.discard(cookie.name) - if not names: - return True + if cookie.name in names and cookie.domain == domain: + if cookie.expires and cookie.expires < now: + self.log.warning("Cookie '%s' has expired", cookie.name) + else: + names.discard(cookie.name) + if not names: + return True return False def _get_date_min_max(self, dmin=None, dmax=None): @@ -491,12 +496,6 @@ def generate_extractors(extractor_data, symtable, classes): symtable[Extr.__name__] = prev = Extr -# Reduce strictness of the expected magic string in cookiejar files. -# (This allows the use of Wget-generated cookiejars without modification) -http.cookiejar.MozillaCookieJar.magic_re = re.compile( - "#( Netscape)? HTTP Cookie File", re.IGNORECASE) - - # Undo automatic pyOpenSSL injection by requests pyopenssl = config.get((), "pyopenssl", False) if not pyopenssl: diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index e8d3abf..3fdeaf9 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -1,69 +1,154 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2019 Mike Fährmann +# Copyright 2014-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images from https://danbooru.donmai.us/""" +"""Extractors for https://danbooru.donmai.us/""" -from . import booru +from .common import Extractor, Message, SharedConfigMixin +from .. import text +import datetime BASE_PATTERN = ( r"(?:https?://)?" - r"(?P<subdomain>danbooru|hijiribe|sonohara|safebooru)" - r"\.donmai\.us") + r"(danbooru|hijiribe|sonohara|safebooru)" + r"\.donmai\.us" +) -class DanbooruExtractor(booru.DanbooruPageMixin, booru.BooruExtractor): +class DanbooruExtractor(SharedConfigMixin, Extractor): """Base class for danbooru extractors""" + basecategory = "booru" category = "danbooru" + filename_fmt = "{category}_{id}_{md5}.{extension}" page_limit = 1000 + page_start = None + per_page = 100 def __init__(self, match): - super().__init__(match) - self.subdomain = match.group("subdomain") - self.scheme = "https" if self.subdomain == "danbooru" else "http" - self.api_url = "{scheme}://{subdomain}.donmai.us/posts.json".format( - scheme=self.scheme, subdomain=self.subdomain) + Extractor.__init__(self, match) + self.root = "https://{}.donmai.us".format(match.group(1)) self.ugoira = self.config("ugoira", True) + self.params = {} username, api_key = self._get_auth_info() if username: self.log.debug("Using HTTP Basic Auth for user '%s'", username) self.session.auth = (username, api_key) - -class DanbooruTagExtractor(booru.TagMixin, DanbooruExtractor): - """Extractor for images from danbooru based on search-tags""" - pattern = BASE_PATTERN + r"/posts\?(?:[^&#]*&)*tags=(?P<tags>[^&#]+)" + def skip(self, num): + pages = num // self.per_page + if pages >= self.page_limit: + pages = self.page_limit - 1 + self.page_start = pages + 1 + return pages * self.per_page + + def items(self): + data = self.metadata() + for post in self.posts(): + try: + url = post["file_url"] + except KeyError: + continue + + text.nameext_from_url(url, post) + if post["extension"] == "zip": + if self.ugoira: + post["frames"] = self.request( + "{}/posts/{}.json?only=pixiv_ugoira_frame_data".format( + self.root, post["id"]) + ).json()["pixiv_ugoira_frame_data"]["data"] + else: + url = post["large_file_url"] + post["extension"] = "webm" + + post.update(data) + yield Message.Directory, post + yield Message.Url, url, post + + def metadata(self): + return {} + + def posts(self): + return self._pagination(self.root + "/posts.json") + + def _pagination(self, url, pagenum=False): + params = self.params.copy() + params["limit"] = self.per_page + params["page"] = self.page_start + + while True: + posts = self.request(url, params=params).json() + yield from posts + + if len(posts) < self.per_page: + return + + if pagenum: + params["page"] += 1 + else: + params["page"] = "b{}".format(posts[-1]["id"]) + + +class DanbooruTagExtractor(DanbooruExtractor): + """Extractor for danbooru posts from tag searches""" + subcategory = "tag" + directory_fmt = ("{category}", "{search_tags}") + archive_fmt = "t_{search_tags}_{id}" + pattern = BASE_PATTERN + r"/posts\?(?:[^&#]*&)*tags=([^&#]+)" test = ( ("https://danbooru.donmai.us/posts?tags=bonocho", { "content": "b196fb9f1668109d7774a0a82efea3ffdda07746", }), # test page transitions - ("https://danbooru.donmai.us/posts?tags=canvas_%28cocktail_soft%29", { - "count": ">= 50", + ("https://danbooru.donmai.us/posts?tags=mushishi", { + "count": ">= 300", }), ("https://hijiribe.donmai.us/posts?tags=bonocho"), ("https://sonohara.donmai.us/posts?tags=bonocho"), ("https://safebooru.donmai.us/posts?tags=bonocho"), ) + def __init__(self, match): + DanbooruExtractor.__init__(self, match) + self.params["tags"] = text.unquote(match.group(2).replace("+", " ")) + + def metadata(self): + return {"search_tags": self.params["tags"]} + -class DanbooruPoolExtractor(booru.PoolMixin, DanbooruExtractor): - """Extractor for image-pools from danbooru""" - pattern = BASE_PATTERN + r"/pools/(?P<pool>\d+)" +class DanbooruPoolExtractor(DanbooruExtractor): + """Extractor for posts from danbooru pools""" + subcategory = "pool" + directory_fmt = ("{category}", "pool", "{pool[id]} {pool[name]}") + archive_fmt = "p_{pool[id]}_{id}" + pattern = BASE_PATTERN + r"/pools/(\d+)" test = ("https://danbooru.donmai.us/pools/7659", { "content": "b16bab12bea5f7ea9e0a836bf8045f280e113d99", }) - -class DanbooruPostExtractor(booru.PostMixin, DanbooruExtractor): - """Extractor for single images from danbooru""" - pattern = BASE_PATTERN + r"/posts/(?P<post>\d+)" + def __init__(self, match): + DanbooruExtractor.__init__(self, match) + self.pool_id = match.group(2) + self.params["tags"] = "pool:" + self.pool_id + + def metadata(self): + url = "{}/pools/{}.json".format(self.root, self.pool_id) + pool = self.request(url).json() + pool["name"] = pool["name"].replace("_", " ") + del pool["post_ids"] + return {"pool": pool} + + +class DanbooruPostExtractor(DanbooruExtractor): + """Extractor for single danbooru posts""" + subcategory = "post" + archive_fmt = "{id}" + pattern = BASE_PATTERN + r"/posts/(\d+)" test = ( ("https://danbooru.donmai.us/posts/294929", { "content": "5e255713cbf0a8e0801dc423563c34d896bb9229", @@ -74,20 +159,47 @@ class DanbooruPostExtractor(booru.PostMixin, DanbooruExtractor): }) ) + def __init__(self, match): + DanbooruExtractor.__init__(self, match) + self.post_id = match.group(2) -class DanbooruPopularExtractor(booru.PopularMixin, DanbooruExtractor): + def posts(self): + url = "{}/posts/{}.json".format(self.root, self.post_id) + return (self.request(url).json(),) + + +class DanbooruPopularExtractor(DanbooruExtractor): """Extractor for popular images from danbooru""" - pattern = BASE_PATTERN + r"/explore/posts/popular(?:\?(?P<query>[^#]*))?" + subcategory = "popular" + directory_fmt = ("{category}", "popular", "{scale}", "{date}") + archive_fmt = "P_{scale[0]}_{date}_{id}" + pattern = BASE_PATTERN + r"/explore/posts/popular(?:\?([^#]*))?" test = ( ("https://danbooru.donmai.us/explore/posts/popular"), (("https://danbooru.donmai.us/explore/posts/popular" - "?date=2013-06-06+03%3A34%3A22+-0400&scale=week"), { - "count": ">= 1", + "?date=2013-06-06&scale=week"), { + "range": "1-120", + "count": 120, }), ) def __init__(self, match): - super().__init__(match) - urlfmt = "{scheme}://{subdomain}.donmai.us/explore/posts/popular.json" - self.api_url = urlfmt.format( - scheme=self.scheme, subdomain=self.subdomain) + DanbooruExtractor.__init__(self, match) + self.params.update(text.parse_query(match.group(2))) + + def metadata(self): + self.page_start = self.page_start or 1 + scale = self.params.get("scale", "day") + date = self.params.get("date") or datetime.date.today().isoformat() + + if scale == "week": + date = datetime.date.fromisoformat(date) + date = (date - datetime.timedelta(days=date.weekday())).isoformat() + elif scale == "month": + date = date[:-3] + + return {"date": date, "scale": scale} + + def posts(self): + url = self.root + "/explore/posts/popular.json" + return self._pagination(url, True) diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 02a14e3..90b27d1 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -93,9 +93,11 @@ class DeviantartExtractor(Extractor): if content["src"].startswith("https://images-wixmp-"): if deviation["index"] <= 790677560: # https://github.com/r888888888/danbooru/issues/4069 - content["src"] = re.sub( + intermediary, count = re.subn( r"(/f/[^/]+/[^/]+)/v\d+/.*", r"/intermediary\1", content["src"]) + if count and self._check_url(intermediary): + content["src"] = intermediary if self.quality: content["src"] = re.sub( r"q_\d+", self.quality, content["src"]) @@ -261,6 +263,9 @@ class DeviantartExtractor(Extractor): if mtype and mtype.startswith("image/"): content.update(data) + def _check_url(self, url): + return self.request(url, method="HEAD", fatal=False).status_code < 400 + class DeviantartUserExtractor(DeviantartExtractor): """Extractor for an artist's user profile""" @@ -717,7 +722,7 @@ class DeviantartExtractorV2(DeviantartExtractor): # select largest video target = max(media["types"], key=lambda x: text.parse_int(x.get("q", "")[:-1])) - src = target["s"] + src = target["b"] elif target["t"] == "flash": src = target["s"] @@ -737,8 +742,10 @@ class DeviantartExtractorV2(DeviantartExtractor): if src.startswith("https://images-wixmp-"): if deviation["index"] <= 790677560: # https://github.com/r888888888/danbooru/issues/4069 - src = re.sub( + intermediary, count = re.subn( r"(/f/[^/]+/[^/]+)/v\d+/.*", r"/intermediary\1", src) + if count and self._check_url(intermediary): + src = intermediary if self.quality: src = re.sub(r"q_\d+", self.quality, src) @@ -811,15 +818,17 @@ class DeviantartDeviationExtractor(DeviantartExtractorV2): }), # video ("https://www.deviantart.com/chi-u/art/-VIDEO-Brushes-330774593", { - "url": "3b6e6e761d2d393fa61a4dc3ed6e7db51b14d07b", + "pattern": r"https://wixmp-.+wixmp.com/v/mp4/.+\.720p\.\w+.mp4", "keyword": { "filename": r"re:_video____brushes_\w+_by_chi_u-d5gxnb5", "extension": "mp4", "target": { "d": 306, - "f": 9963639, - "q": "1080p", + "f": 19367585, + "h": 720, + "q": "720p", "t": "video", + "w": 1364, "src": str, }, } @@ -952,11 +961,15 @@ class DeviantartAPI(): self.folders = extractor.config("folders", False) self.metadata = extractor.extra or extractor.config("metadata", False) - self.refresh_token = extractor.config("refresh-token") - self.client_id = extractor.config("client-id", self.CLIENT_ID) + self.client_id = extractor.config( + "client-id", self.CLIENT_ID) self.client_secret = extractor.config( "client-secret", self.CLIENT_SECRET) + self.refresh_token = extractor.config("refresh-token") + if self.refresh_token == "cache": + self.refresh_token = "#" + str(self.client_id) + self.log.debug( "Using %s API credentials (client-id %s)", "default" if self.client_id == self.CLIENT_ID else "custom", @@ -1026,8 +1039,12 @@ class DeviantartAPI(): "type" : kind, "include_session": "false", } - return self.extractor.request( - url, headers=headers, params=params, fatal=None).json() + response = self.extractor.request( + url, headers=headers, params=params, fatal=None) + if response.status_code == 404: + raise exception.StopExtraction( + "Your account must use the Eclipse interface.") + return response.json() def deviation_metadata(self, deviations): """ Fetch deviation metadata for a set of deviations""" diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py index f245ddf..bc3f67a 100644 --- a/gallery_dl/extractor/e621.py +++ b/gallery_dl/extractor/e621.py @@ -1,71 +1,193 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2019 Mike Fährmann +# Copyright 2014-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images from https://e621.net/""" +"""Extractors for https://e621.net/""" -from . import booru +from .common import Extractor, Message, SharedConfigMixin +from .. import text +import datetime +import time -class E621Extractor(booru.MoebooruPageMixin, booru.BooruExtractor): +BASE_PATTERN = r"(?:https?://)?e(621|926)\.net" + + +class E621Extractor(SharedConfigMixin, Extractor): """Base class for e621 extractors""" + basecategory = "booru" category = "e621" - api_url = "https://e621.net/post/index.json" - post_url = "https://e621.net/post/show/{}" + filename_fmt = "{category}_{id}_{file[md5]}.{extension}" page_limit = 750 + page_start = None + per_page = 200 + _last_request = 0 + + def __init__(self, match): + Extractor.__init__(self, match) + self.root = "https://e{}.net".format(match.group(1)) + self.params = {} + + username, api_key = self._get_auth_info() + if username: + self.log.debug("Using HTTP Basic Auth for user '%s'", username) + self.session.auth = (username, api_key) + + def request(self, url, **kwargs): + diff = time.time() - E621Extractor._last_request + if diff < 1.0: + self.log.debug("Sleeping for %s seconds", diff) + time.sleep(diff) + kwargs["headers"] = {"User-Agent": "gallery-dl/1.13.0 (by mikf)"} + response = Extractor.request(self, url, **kwargs) + E621Extractor._last_request = time.time() + return response + + def items(self): + data = self.metadata() + for post in self.posts(): + file = post["file"] + + if not file["url"]: + ihash = file["md5"] + file["url"] = "https://static1.{}/data/{}/{}/{}.{}".format( + self.root[8:], ihash[0:2], ihash[2:4], ihash, file["ext"]) + + post["filename"] = file["md5"] + post["extension"] = file["ext"] + post.update(data) + yield Message.Directory, post + yield Message.Url, file["url"], post + + def metadata(self): + return {} + def posts(self): + return self._pagination(self.root + "/posts.json") -class E621TagExtractor(booru.TagMixin, E621Extractor): - """Extractor for images from e621.net based on search-tags""" - pattern = (r"(?:https?://)?(?:www\.)?e621\.net/post" - r"(?:/index/\d+/|\?tags=)(?P<tags>[^/?&#]+)") + def _pagination(self, url): + params = self.params.copy() + params["limit"] = self.per_page + tags = params.get("tags", "") + + while True: + posts = self.request(url, params=params).json()["posts"] + yield from posts + + if len(posts) < self.per_page: + return + params["tags"] = "id:<{} {}".format(posts[-1]["id"], tags) + + +class E621TagExtractor(E621Extractor): + """Extractor for e621 posts from tag searches""" + subcategory = "tag" + directory_fmt = ("{category}", "{search_tags}") + archive_fmt = "t_{search_tags}_{id}" + pattern = BASE_PATTERN + r"/posts?(?:\?.*?tags=|/index/\d+/)([^&#]+)" test = ( - ("https://e621.net/post/index/1/anry", { + ("https://e621.net/posts?tags=anry", { "url": "8021e5ea28d47c474c1ffc9bd44863c4d45700ba", "content": "501d1e5d922da20ee8ff9806f5ed3ce3a684fd58", }), + ("https://e926.net/posts?tags=anry"), + ("https://e621.net/post/index/1/anry"), ("https://e621.net/post?tags=anry"), ) + def __init__(self, match): + E621Extractor.__init__(self, match) + self.params["tags"] = text.unquote(match.group(2).replace("+", " ")) + + def metadata(self): + return {"search_tags": self.params["tags"]} + -class E621PoolExtractor(booru.PoolMixin, E621Extractor): - """Extractor for image-pools from e621.net""" - pattern = r"(?:https?://)?(?:www\.)?e621\.net/pool/show/(?P<pool>\d+)" - test = ("https://e621.net/pool/show/73", { - "url": "842f2fb065c7c339486a9b1d689020b8569888ed", - "content": "c2c87b7a9150509496cddc75ccab08109922876a", - }) - - -class E621PostExtractor(booru.PostMixin, E621Extractor): - """Extractor for single images from e621.net""" - pattern = r"(?:https?://)?(?:www\.)?e621\.net/post/show/(?P<post>\d+)" - test = ("https://e621.net/post/show/535", { - "url": "f7f78b44c9b88f8f09caac080adc8d6d9fdaa529", - "content": "66f46e96a893fba8e694c4e049b23c2acc9af462", - "options": (("tags", True),), - "keyword": { - "tags_artist": "anry", - "tags_general": str, - "tags_species": str, - }, - }) - - -class E621PopularExtractor(booru.MoebooruPopularMixin, E621Extractor): - """Extractor for popular images from 621.net""" - pattern = (r"(?:https?://)?(?:www\.)?e621\.net" - r"/post/popular_by_(?P<scale>day|week|month)" - r"(?:\?(?P<query>[^#]*))?") - test = ("https://e621.net/post/popular_by_month?month=6&year=2013", { - "count": 32, - }) +class E621PoolExtractor(E621Extractor): + """Extractor for e621 pools""" + subcategory = "pool" + directory_fmt = ("{category}", "pool", "{pool[id]} {pool[name]}") + archive_fmt = "p_{pool[id]}_{id}" + pattern = BASE_PATTERN + r"/pool(?:s|/show)/(\d+)" + test = ( + ("https://e621.net/pools/73", { + "url": "842f2fb065c7c339486a9b1d689020b8569888ed", + "content": "c2c87b7a9150509496cddc75ccab08109922876a", + }), + ("https://e621.net/pool/show/73"), + ) def __init__(self, match): - super().__init__(match) - self.api_url = "https://e621.net/post/popular_by_{scale}.json".format( - scale=self.scale) + E621Extractor.__init__(self, match) + self.pool_id = match.group(2) + self.params["tags"] = "pool:" + self.pool_id + + def metadata(self): + url = "{}/pools/{}.json".format(self.root, self.pool_id) + pool = self.request(url).json() + pool["name"] = pool["name"].replace("_", " ") + del pool["post_ids"] + return {"pool": pool} + + +class E621PostExtractor(E621Extractor): + """Extractor for single e621 posts""" + subcategory = "post" + archive_fmt = "{id}" + pattern = BASE_PATTERN + r"/post(?:s|/show)/(\d+)" + test = ( + ("https://e621.net/posts/535", { + "url": "f7f78b44c9b88f8f09caac080adc8d6d9fdaa529", + "content": "66f46e96a893fba8e694c4e049b23c2acc9af462", + }), + ("https://e621.net/post/show/535"), + ) + + def __init__(self, match): + E621Extractor.__init__(self, match) + self.post_id = match.group(2) + + def posts(self): + url = "{}/posts/{}.json".format(self.root, self.post_id) + return (self.request(url).json()["post"],) + + +class E621PopularExtractor(E621Extractor): + """Extractor for popular images from e621""" + subcategory = "popular" + directory_fmt = ("{category}", "popular", "{scale}", "{date}") + archive_fmt = "P_{scale[0]}_{date}_{id}" + pattern = BASE_PATTERN + r"/explore/posts/popular(?:\?([^#]*))?" + test = ( + ("https://e621.net/explore/posts/popular"), + (("https://e621.net/explore/posts/popular" + "?date=2019-06-01&scale=month"), { + "pattern": r"https://static\d.e621.net/data/../../[0-9a-f]+", + "count": ">= 70", + }) + ) + + def __init__(self, match): + E621Extractor.__init__(self, match) + self.params.update(text.parse_query(match.group(2))) + + def metadata(self): + scale = self.params.get("scale", "day") + date = self.params.get("date") or datetime.date.today().isoformat() + date = date[:10] + + if scale == "week": + date = datetime.date.fromisoformat(date) + date = (date - datetime.timedelta(days=date.weekday())).isoformat() + elif scale == "month": + date = date[:-3] + + return {"date": date, "scale": scale} + + def posts(self): + url = self.root + "/explore/posts/popular.json" + return self._pagination(url) diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py index 967fd9c..a9d3c9d 100644 --- a/gallery_dl/extractor/flickr.py +++ b/gallery_dl/extractor/flickr.py @@ -61,7 +61,8 @@ class FlickrImageExtractor(FlickrExtractor): test = ( ("https://www.flickr.com/photos/departingyyz/16089302239", { "pattern": pattern, - "content": "0821a28ee46386e85b02b67cf2720063440a228c", + "content": ("3133006c6d657fe54cf7d4c46b82abbcb0efaf9f", + "0821a28ee46386e85b02b67cf2720063440a228c"), "keyword": { "comments": int, "description": str, diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py new file mode 100644 index 0000000..ba60e19 --- /dev/null +++ b/gallery_dl/extractor/furaffinity.py @@ -0,0 +1,235 @@ +# -*- coding: utf-8 -*- + +# Copyright 2020 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.furaffinity.net/""" + +from .common import Extractor, Message +from .. import text, util + + +BASE_PATTERN = r"(?:https?://)?(?:www\.|sfw\.)?furaffinity\.net" + + +class FuraffinityExtractor(Extractor): + """Base class for furaffinity extractors""" + category = "furaffinity" + directory_fmt = ("{category}", "{user!l}") + filename_fmt = "{id} {title}.{extension}" + archive_fmt = "{id}" + cookiedomain = ".furaffinity.net" + root = "https://www.furaffinity.net" + + def __init__(self, match): + Extractor.__init__(self, match) + self.user = match.group(1) + self.offset = 0 + + def items(self): + for post_id in util.advance(self.posts(), self.offset): + post = self._parse_post(post_id) + if post: + yield Message.Directory, post + yield Message.Url, post["url"], post + + def posts(self): + return self._pagination() + + def skip(self, num): + self.offset += num + return num + + def _parse_post(self, post_id): + url = "{}/view/{}/".format(self.root, post_id) + extr = text.extract_from(self.request(url).text) + title, _, artist = text.unescape(extr( + 'property="og:title" content="', '"')).rpartition(" by ") + path = extr('href="//d.facdn.net/', '"') + + if not path: + self.log.warning( + "Unable to download post %s (\"%s\")", + post_id, text.remove_html( + extr('System Message', '</section>') or + extr('System Message', '</table>') + ) + ) + return None + + pi = text.parse_int + rh = text.remove_html + + data = text.nameext_from_url(path, { + "id" : pi(post_id), + "title" : title, + "artist": artist, + "user" : self.user or artist, + "url" : "https://d.facdn.net/" + path + }) + + tags = extr('class="tags-row">', '</section>') + if tags: + # new site layout + data["tags"] = text.split_html(tags) + data["description"] = text.unescape(rh(extr( + 'class="section-body">', '</div>'), "", "")) + data["views"] = pi(rh(extr('class="views">', '</span>'))) + data["favorites"] = pi(rh(extr('class="favorites">', '</span>'))) + data["comments"] = pi(rh(extr('class="comments">', '</span>'))) + data["rating"] = rh(extr('class="rating">', '</span>')) + data["fa_category"] = rh(extr('>Category</strong>', '</span>')) + data["theme"] = rh(extr('>', '<')) + data["species"] = rh(extr('>Species</strong>', '</div>')) + data["gender"] = rh(extr('>Gender</strong>', '</div>')) + data["width"] = pi(extr("<span>", "x")) + data["height"] = pi(extr("", "p")) + else: + # old site layout + data["fa_category"] = extr("<b>Category:</b>", "<").strip() + data["theme"] = extr("<b>Theme:</b>", "<").strip() + data["species"] = extr("<b>Species:</b>", "<").strip() + data["gender"] = extr("<b>Gender:</b>", "<").strip() + data["favorites"] = pi(extr("<b>Favorites:</b>", "<")) + data["comments"] = pi(extr("<b>Comments:</b>", "<")) + data["views"] = pi(extr("<b>Views:</b>", "<")) + data["width"] = pi(extr("<b>Resolution:</b>", "x")) + data["height"] = pi(extr("", "<")) + data["tags"] = text.split_html(extr( + 'id="keywords">', '</div>'))[::2] + data["rating"] = extr('<img alt="', ' ') + data["description"] = text.unescape(text.remove_html(extr( + "</table>", "</table>"), "", "")) + data["date"] = text.parse_timestamp(data["filename"].partition(".")[0]) + + return data + + def _pagination(self): + num = 1 + + while True: + url = "{}/{}/{}/{}/".format( + self.root, self.subcategory, self.user, num) + page = self.request(url).text + post_id = None + + for post_id in text.extract_iter(page, 'id="sid-', '"'): + yield post_id + + if not post_id: + return + num += 1 + + def _pagination_favorites(self): + path = "/favorites/{}/".format(self.user) + + while path: + page = self.request(self.root + path).text + yield from text.extract_iter(page, 'id="sid-', '"') + path = text.extract(page, 'right" href="', '"')[0] + + +class FuraffinityGalleryExtractor(FuraffinityExtractor): + """Extractor for a furaffinity user's gallery""" + subcategory = "gallery" + pattern = BASE_PATTERN + r"/gallery/([^/?&#]+)" + test = ("https://www.furaffinity.net/gallery/mirlinthloth/", { + "pattern": r"https://d.facdn.net/art/mirlinthloth/\d+/\d+.\w+\.\w+", + "range": "45-50", + "count": 6, + }) + + +class FuraffinityScrapsExtractor(FuraffinityExtractor): + """Extractor for a furaffinity user's scraps""" + subcategory = "scraps" + directory_fmt = ("{category}", "{user!l}", "Scraps") + pattern = BASE_PATTERN + r"/scraps/([^/?&#]+)" + test = ("https://www.furaffinity.net/scraps/mirlinthloth/", { + "pattern": r"https://d.facdn.net/art/[^/]+(/stories)?/\d+/\d+.\w+.\w+", + "count": ">= 3", + }) + + +class FuraffinityFavoriteExtractor(FuraffinityExtractor): + """Extractor for a furaffinity user's favorites""" + subcategory = "favorite" + directory_fmt = ("{category}", "{user!l}", "Favorites") + pattern = BASE_PATTERN + r"/favorites/([^/?&#]+)" + test = ("https://www.furaffinity.net/favorites/mirlinthloth/", { + "pattern": r"https://d.facdn.net/art/[^/]+/\d+/\d+.\w+\.\w+", + "range": "45-50", + "count": 6, + }) + + def posts(self): + return self._pagination_favorites() + + +class FuraffinityPostExtractor(FuraffinityExtractor): + """Extractor for individual posts on furaffinity""" + subcategory = "post" + pattern = BASE_PATTERN + r"/(?:view|full)/(\d+)" + test = ( + ("https://www.furaffinity.net/view/21835115/", { + "url": "eae4ef93d99365c69b31a37561bd800c03d336ad", + "keyword": { + "artist" : "mirlinthloth", + "date" : "dt:2016-11-27 17:24:06", + "description": "A Song made playing the game Cosmic DJ.", + "extension" : "mp3", + "filename" : r"re:\d+\.\w+_dj_fennmink_-_bude_s_4_ever", + "id" : 21835115, + "tags" : list, + "title" : "Bude's 4 Ever", + "url" : "re:https://d.facdn.net/art/mirlinthloth/music", + "user" : "mirlinthloth", + "views" : int, + "favorites" : int, + "comments" : int, + "rating" : "General", + "fa_category": "Music", + "theme" : "All", + "species" : "Unspecified / Any", + "gender" : "Any", + "width" : 120, + "height" : 120, + }, + }), + ("https://furaffinity.net/view/21835115/"), + ("https://sfw.furaffinity.net/view/21835115/"), + ("https://www.furaffinity.net/full/21835115/"), + ) + + def posts(self): + post_id = self.user + self.user = None + return (post_id,) + + +class FuraffinityUserExtractor(FuraffinityExtractor): + """Extractor for furaffinity user profiles""" + subcategory = "user" + cookiedomain = None + pattern = BASE_PATTERN + r"/user/([^/?&#]+)" + test = ( + ("https://www.furaffinity.net/user/mirlinthloth/", { + "pattern": r"/gallery/mirlinthloth/$", + }), + ("https://www.furaffinity.net/user/mirlinthloth/", { + "options": (("include", "all"),), + "pattern": r"/(gallery|scraps|favorites)/mirlinthloth/$", + "count": 3, + }), + ) + + def items(self): + base = "{}/{{}}/{}/".format(self.root, self.user) + return self._dispatch_extractors(( + (FuraffinityGalleryExtractor , base.format("gallery")), + (FuraffinityScrapsExtractor , base.format("scraps")), + (FuraffinityFavoriteExtractor, base.format("favorites")), + ), ("gallery",)) diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py index 19f9481..6e82091 100644 --- a/gallery_dl/extractor/hentaifoundry.py +++ b/gallery_dl/extractor/hentaifoundry.py @@ -247,7 +247,7 @@ class HentaifoundryImageExtractor(HentaifoundryExtractor): "content": "91bf01497c39254b6dfb234a18e8f01629c77fd1", "keyword": { "artist" : "Tenpura", - "date" : "type:datetime", + "date" : "dt:2016-02-22 14:41:19", "description": "Thank you!", "height" : 700, "index" : 407501, diff --git a/gallery_dl/extractor/hentaihand.py b/gallery_dl/extractor/hentaihand.py new file mode 100644 index 0000000..302999b --- /dev/null +++ b/gallery_dl/extractor/hentaihand.py @@ -0,0 +1,134 @@ +# -*- coding: utf-8 -*- + +# Copyright 2020 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://hentaihand.com/""" + +from .common import GalleryExtractor, Extractor, Message +from .. import text, util +import collections + + +class HentaihandGalleryExtractor(GalleryExtractor): + """Extractor for image galleries on hentaihand.com""" + category = "hentaihand" + root = "https://hentaihand.com" + pattern = (r"(?i)(?:https?://)?(?:www\.)?hentaihand\.com" + r"/(?:comi|view)c/(\d+)") + test = ( + ("https://hentaihand.com/comic/272772/kouda-tomohiro-chiyomi-bl", { + "pattern": r"https://i.hentaihand.com/.*/images/full/\d+.jpg$", + "count": 19, + "keyword": { + "artists" : ["kouda tomohiro"], + "categories": ["manga"], + "date" : "Feb. 6, 2020, 3:19 p.m.", + "gallery_id": 272772, + "lang" : "en", + "language" : "English", + "relationships": ["family", "step family"], + "tags" : list, + "title" : r"re:\[Kouda Tomohiro\] Chiyomi Blizzard", + "title_jp" : r"re:\[幸田朋弘\] ちよみブリザード", + }, + }), + ("https://hentaihand.com/viewc/272772/kouda-tomohiro-chiyomi-bl"), + ) + + def __init__(self, match): + self.gallery_id = match.group(1) + url = "{}/comic/{}".format(self.root, self.gallery_id) + GalleryExtractor.__init__(self, match, url) + + def metadata(self, page): + extr = text.extract_from(page) + + title_en = text.unescape(extr("<h1>", "<")) + title_jp = text.unescape(extr("<h2>", "<")) + tags = extr('<section id="tags"', "</section>") + + data = { + "gallery_id" : text.parse_int(self.gallery_id), + "title" : title_en or title_jp, + "title_en" : title_en, + "title_jp" : title_jp, + + # impossible to parse with strptime() + "date" : extr('datetime="', '"'), + } + + tdict = collections.defaultdict(list) + for path in text.extract_iter(tags, 'href="/', '"'): + kind, _, name = path.partition("/") + tdict[kind].append(name.replace("+", " ")) + data.update(tdict) + + if "languages" in data: + data["language"] = data["languages"][-1].capitalize() + data["lang"] = util.language_to_code(data["language"]) + del data["languages"] + return data + + def images(self, _): + url = "{}/viewc/{}/1".format(self.root, self.gallery_id) + page = self.request(url).text + images = text.extract(page, "var images", ";")[0] + return [(img, None) for img in text.extract_iter(images, "'", "'")] + + +class HentaihandTagExtractor(Extractor): + """Extractor for tag searches on hentaihand.com""" + category = "hentaihand" + subcategory = "tag" + root = "https://hentaihand.com" + pattern = (r"(?i)(?:https?://)?(?:www\.)?hentaihand\.com" + r"(/(?:parody|characters|tags|artists|groups|languages" + r"|categories|relationships)/[^#]+)") + test = ( + ("https://hentaihand.com/artists/tony+taka", { + "pattern": HentaihandGalleryExtractor.pattern, + "count": ">= 50", + }), + ("https://hentaihand.com/artists/tony+taka/popular?page=2"), + ("https://hentaihand.com/tags/full+color"), + ("https://hentaihand.com/languages/japanese"), + ("https://hentaihand.com/categories/manga"), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.path, _, query = match.group(1).partition("?") + self.query = text.parse_query(query) + self.query["page"] = text.parse_int(self.query.get("page"), 1) + + def items(self): + yield Message.Version, 1 + url = self.root + self.path + params = self.query.copy() + data = {"_extractor": HentaihandGalleryExtractor} + + while True: + page = self.request(url, params=params).text + + for path in text.extract_iter(page, '<a href="/comic/', '"'): + yield Message.Queue, self.root + "/comic/" + path, data + + pos = page.find(">(current)<") + if pos < 0 or page.find('class="page-link" href="', pos) < 0: + break + params["page"] += 1 + + +class HentaihandSearchExtractor(HentaihandTagExtractor): + """Extractor for search results on hentaihand.com""" + subcategory = "search" + pattern = r"(?i)(?:https?://)?(?:www\.)?hentaihand\.com(/search/?[^#]+)" + test = ("https://hentaihand.com/search?q=color", { + "pattern": HentaihandGalleryExtractor.pattern, + "range": "1-50", + "count": 50, + }) diff --git a/gallery_dl/extractor/hentainexus.py b/gallery_dl/extractor/hentainexus.py index 193cadf..ad97eba 100644 --- a/gallery_dl/extractor/hentainexus.py +++ b/gallery_dl/extractor/hentainexus.py @@ -22,7 +22,7 @@ class HentainexusGalleryExtractor(GalleryExtractor): test = ( ("https://hentainexus.com/view/5688", { "url": "746d0043e20030f1171aae5ea113176607302517", - "keyword": "c1b7091e2bc2f733f6401711e072ad11cf93dd69", + "keyword": "77702b42f8f76ecfe5d8a14cfbbcbd855eb14d7f", }), ("https://hentainexus.com/read/5688"), ) diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py new file mode 100644 index 0000000..e0b0f50 --- /dev/null +++ b/gallery_dl/extractor/hiperdex.py @@ -0,0 +1,137 @@ +# -*- coding: utf-8 -*- + +# Copyright 2020 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://hiperdex.com/""" + +from .common import ChapterExtractor, MangaExtractor +from .. import text +from ..cache import memcache +import re + + +class HiperdexBase(): + """Base class for hiperdex extractors""" + category = "hiperdex" + root = "https://hiperdex.com" + + @memcache(keyarg=1) + def manga_data(self, manga, page=None): + if not page: + url = "{}/manga/{}/".format(self.root, manga) + page = self.request(url).text + extr = text.extract_from(page) + + return { + "manga" : text.unescape(extr( + "<title>", "<").rpartition("&")[0].strip()), + "score" : text.parse_float(extr( + 'id="averagerate">', '<')), + "author" : text.remove_html(extr( + 'class="author-content">', '</div>')), + "artist" : text.remove_html(extr( + 'class="artist-content">', '</div>')), + "genre" : text.split_html(extr( + 'class="genres-content">', '</div>'))[::2], + "type" : extr( + 'class="summary-content">', '<').strip(), + "release": text.parse_int(text.remove_html(extr( + 'class="summary-content">', '</div>'))), + "status" : extr( + 'class="summary-content">', '<').strip(), + "description": text.remove_html(text.unescape(extr( + 'class="description-summary">', '</div>'))), + "language": "English", + "lang" : "en", + } + + def chapter_data(self, chapter): + chapter, _, minor = chapter.partition("-") + data = { + "chapter" : text.parse_int(chapter), + "chapter_minor": "." + minor if minor and minor != "end" else "", + } + data.update(self.manga_data(self.manga.lower())) + return data + + +class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor): + """Extractor for manga chapters from hiperdex.com""" + pattern = (r"(?:https?://)?(?:www\.)?hiperdex\.com" + r"(/manga/([^/?&#]+)/([^/?&#]+))") + test = ("https://hiperdex.com/manga/domestic-na-kanojo/154-5/", { + "url": "111bc3ee14ce91d78c275770ef63b56c9ac15d8d", + "keyword": { + "artist" : "Sasuga Kei", + "author" : "Sasuga Kei", + "chapter": 154, + "chapter_minor": ".5", + "description": "re:Natsuo Fujii is in love with his teacher, Hina", + "genre" : list, + "manga" : "Domestic na Kanojo", + "release": 2014, + "score" : float, + "type" : "Manga", + }, + }) + + def __init__(self, match): + path, self.manga, self.chapter = match.groups() + ChapterExtractor.__init__(self, match, self.root + path + "/") + + def metadata(self, _): + return self.chapter_data(self.chapter) + + def images(self, page): + return [ + (url.strip(), None) + for url in re.findall(r'id="image-\d+"\s+src="([^"]+)', page) + ] + + +class HiperdexMangaExtractor(HiperdexBase, MangaExtractor): + """Extractor for manga from hiperdex.com""" + chapterclass = HiperdexChapterExtractor + pattern = r"(?:https?://)?(?:www\.)?hiperdex\.com(/manga/([^/?&#]+))/?$" + test = ("https://hiperdex.com/manga/youre-not-that-special/", { + "count": 51, + "pattern": HiperdexChapterExtractor.pattern, + "keyword": { + "artist" : "Bolp", + "author" : "Abyo4", + "chapter": int, + "chapter_minor": "", + "description": "re:I didn’t think much of the creepy girl in ", + "genre" : list, + "manga" : "You're Not That Special!", + "release": 2019, + "score" : float, + "status" : "Completed", + "type" : "Manhwa", + }, + }) + + def __init__(self, match): + path, self.manga = match.groups() + MangaExtractor.__init__(self, match, self.root + path + "/") + + def chapters(self, page): + self.manga_data(self.manga, page) + results = [] + last = None + + page = text.extract(page, 'class="page-content-listing', '</ul>')[0] + for match in HiperdexChapterExtractor.pattern.finditer(page): + path = match.group(1) + if last != path: + last = path + results.append(( + self.root + path, + self.chapter_data(path.rpartition("/")[2]), + )) + + return results diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py index d6fdcf2..3baf819 100644 --- a/gallery_dl/extractor/hitomi.py +++ b/gallery_dl/extractor/hitomi.py @@ -6,7 +6,7 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images from https://hitomi.la/""" +"""Extractors for https://hitomi.la/""" from .common import GalleryExtractor from .. import text, util @@ -27,21 +27,25 @@ class HitomiGalleryExtractor(GalleryExtractor): "keyword": "6701f8f588f119ef84cd29bdf99a399417b0a6a2", "count": 16, }), + # download test ("https://hitomi.la/galleries/1401410.html", { - # download test "range": "1", "content": "b3ca8c6c8cc5826cf8b4ceb7252943abad7b8b4c", }), + # Game CG with scenes (#321) ("https://hitomi.la/galleries/733697.html", { - # Game CG with scenes (#321) - "url": "21064f9e3c244aca87f1a91967a3fbe79032c4ce", + "url": "b4cbc76032852db4a655bf6a2c4d58eae8153c8e", "count": 210, }), + # fallback for galleries only available through /reader/ URLs ("https://hitomi.la/galleries/1045954.html", { - # fallback for galleries only available through /reader/ URLs - "url": "0a67f5e6c3c6a384b578e328f4817fa6ccdf856a", + "url": "f3aa914ad148437f72d307268fa0d250eabe8dab", "count": 1413, }), + # gallery with "broken" redirect + ("https://hitomi.la/cg/scathacha-sama-okuchi-ecchi-1291900.html", { + "count": 10, + }), ("https://hitomi.la/manga/amazon-no-hiyaku-867789.html"), ("https://hitomi.la/manga/867789.html"), ("https://hitomi.la/doujinshi/867789.html"), @@ -51,84 +55,90 @@ class HitomiGalleryExtractor(GalleryExtractor): ) def __init__(self, match): - self.gallery_id = match.group(1) - self.fallback = False - url = "{}/galleries/{}.html".format(self.root, self.gallery_id) + gid = match.group(1) + url = "https://ltn.hitomi.la/galleries/{}.js".format(gid) GalleryExtractor.__init__(self, match, url) + self.info = None + self.session.headers["Referer"] = "{}/reader/{}.html".format( + self.root, gid) + + def metadata(self, page): + self.info = info = json.loads(page.partition("=")[2]) + + data = self._data_from_gallery_info(info) + if self.config("metadata", True): + data.update(self._data_from_gallery_page(info)) + return data + + def _data_from_gallery_info(self, info): + language = info.get("language") + if language: + language = language.capitalize() + + tags = [] + for tinfo in info["tags"]: + tag = tinfo["tag"] + if tinfo.get("female"): + tag += " ♀" + elif tinfo.get("male"): + tag += " ♂" + tags.append(string.capwords(tag)) + + return { + "gallery_id": text.parse_int(info["id"]), + "title" : info["title"], + "type" : info["type"].capitalize(), + "language" : language, + "lang" : util.language_to_code(language), + "tags" : tags, + "date" : text.parse_datetime( + info["date"] + ":00", "%Y-%m-%d %H:%M:%S%z"), + } + + def _data_from_gallery_page(self, info): + url = "{}/galleries/{}.html".format(self.root, info["id"]) - def request(self, url, **kwargs): - response = GalleryExtractor.request(self, url, fatal=False, **kwargs) - if response.status_code == 404: - self.fallback = True - url = url.replace("/galleries/", "/reader/") - response = GalleryExtractor.request(self, url, **kwargs) - elif b"<title>Redirect</title>" in response.content: + # follow redirects + while True: + response = self.request(url, fatal=False) + if b"<title>Redirect</title>" not in response.content: + break url = text.extract(response.text, "href='", "'")[0] if not url.startswith("http"): url = text.urljoin(self.root, url) - response = self.request(url, **kwargs) - return response - def metadata(self, page): - if self.fallback: - return { - "gallery_id": text.parse_int(self.gallery_id), - "title": text.unescape(text.extract( - page, "<title>", "<")[0].rpartition(" | ")[0]), - } - - extr = text.extract_from(page, page.index('<h1><a href="/reader/')) - data = { - "gallery_id": text.parse_int(self.gallery_id), - "title" : text.unescape(extr('.html">', '<').strip()), - "artist" : self._prep(extr('<h2>', '</h2>')), - "group" : self._prep(extr('<td>Group</td><td>', '</td>')), - "type" : self._prep_1(extr('<td>Type</td><td>', '</td>')), - "language" : self._prep_1(extr('<td>Language</td><td>', '</td>')), - "parody" : self._prep(extr('<td>Series</td><td>', '</td>')), - "characters": self._prep(extr('<td>Characters</td><td>', '</td>')), - "tags" : self._prep(extr('<td>Tags</td><td>', '</td>')), - "date" : self._date(extr('<span class="date">', '</span>')), + if response.status_code >= 400: + return {} + + def prep(value): + return [ + text.unescape(string.capwords(v)) + for v in text.extract_iter(value or "", '.html">', '<') + ] + + extr = text.extract_from(response.text) + return { + "artist" : prep(extr('<h2>', '</h2>')), + "group" : prep(extr('<td>Group</td><td>', '</td>')), + "parody" : prep(extr('<td>Series</td><td>', '</td>')), + "characters": prep(extr('<td>Characters</td><td>', '</td>')), } - if data["language"] == "N/a": - data["language"] = None - data["lang"] = util.language_to_code(data["language"]) - return data - - def images(self, page): - # set Referer header before image downloads (#239) - self.session.headers["Referer"] = self.gallery_url - - # get 'galleryinfo' - url = "https://ltn.hitomi.la/galleries/{}.js".format(self.gallery_id) - page = self.request(url).text + def images(self, _): result = [] - for image in json.loads(page.partition("=")[2]): + for image in self.info["files"]: ihash = image["hash"] idata = text.nameext_from_url(image["name"]) # see https://ltn.hitomi.la/common.js - offset = int(ihash[-3:-1], 16) % 3 + inum = int(ihash[-3:-1], 16) + frontends = 2 if inum < 0x30 else 3 + inum = 1 if inum < 0x09 else inum + url = "https://{}a.hitomi.la/images/{}/{}/{}.{}".format( - chr(97 + offset), + chr(97 + (inum % frontends)), ihash[-1], ihash[-3:-1], ihash, idata["extension"], ) result.append((url, idata)) return result - - @staticmethod - def _prep(value): - return [ - text.unescape(string.capwords(v)) - for v in text.extract_iter(value or "", '.html">', '<') - ] - - @staticmethod - def _prep_1(value): - return text.remove_html(value).capitalize() - - @staticmethod - def _date(value): - return text.parse_datetime(value + ":00", "%Y-%m-%d %H:%M:%S%z") diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py index 15152b7..d0aa4f2 100644 --- a/gallery_dl/extractor/imgbb.py +++ b/gallery_dl/extractor/imgbb.py @@ -111,13 +111,13 @@ class ImgbbAlbumExtractor(ImgbbExtractor): test = ( ("https://ibb.co/album/i5PggF", { "range": "1-80", - "url": "570872b6eb3e11cf10b618922b780fed204c3f09", - "keyword": "0f2fc956728c36540c577578bd168d2459d6ae4b", + "url": "70afec9fcc3a6de62a6b644b487d892d8d47cf1a", + "keyword": "569e1d88ebdd27655387559cdf1cd526a3e1ab69", }), ("https://ibb.co/album/i5PggF?sort=title_asc", { "range": "1-80", - "url": "e2e387b8fdb3690bd75d804d0af2833112e385cd", - "keyword": "a307fc9d2085bdc0eb7c538c8d866c59198d460c", + "url": "a2dfc58fe3348fa37e242082bd5a85eaa490d0a5", + "keyword": "5bb79c82411c3770d673fac64a0a98fa28111c3b", }), # no user data (#471) ("https://ibb.co/album/kYKpwF", { @@ -192,12 +192,12 @@ class ImgbbImageExtractor(ImgbbExtractor): subcategory = "image" pattern = r"(?:https?://)?ibb\.co/(?!album/)([^/?&#]+)" test = ("https://ibb.co/fUqh5b", { - "pattern": "https://image.ibb.co/dY5FQb/Arundel-Ireeman-5.jpg", + "pattern": r"https://i\.ibb\.co/g3kvx80/Arundel-Ireeman-5\.jpg", "content": "c5a0965178a8b357acd8aa39660092918c63795e", "keyword": { "id" : "fUqh5b", "title" : "Arundel Ireeman 5", - "url" : "https://image.ibb.co/dY5FQb/Arundel-Ireeman-5.jpg", + "url" : "https://i.ibb.co/g3kvx80/Arundel-Ireeman-5.jpg", "width" : 960, "height": 719, "user" : "folkie", diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index 5084e80..0813ea9 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2019 Mike Fährmann +# Copyright 2015-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images from https://imgur.com/""" +"""Extractors for https://imgur.com/""" from .common import Extractor, Message from .. import text, exception @@ -65,7 +65,7 @@ class ImgurImageExtractor(ImgurExtractor): "account_url" : None, "animated" : False, "bandwidth" : int, - "date" : "type:datetime", + "date" : "dt:2016-11-10 14:24:35", "datetime" : 1478787875, "description" : None, "edited" : "0", @@ -142,7 +142,7 @@ class ImgurAlbumExtractor(ImgurExtractor): "cover_edited": None, "cover_height": 1400, "cover_width" : 951, - "date" : "type:datetime", + "date" : "dt:2015-10-09 10:37:50", "datetime" : 1444387070, "description" : None, "favorite" : False, diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 05adac1..96afea1 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -1,12 +1,13 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2019 Leonardo Taccari, Mike Fährmann +# Copyright 2018-2019 Leonardo Taccari +# Copyright 2018-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images from https://www.instagram.com/""" +"""Extractors for https://www.instagram.com/""" from .common import Extractor, Message from .. import text, exception @@ -129,6 +130,7 @@ class InstagramExtractor(Extractor): 'owner_id': media['owner']['id'], 'username': media['owner']['username'], 'fullname': media['owner']['full_name'], + "post_shortcode": media['shortcode'], 'description': text.parse_unicode_escapes('\n'.join( edge['node']['text'] for edge in media['edge_media_to_caption']['edges'] @@ -306,12 +308,13 @@ class InstagramImageExtractor(InstagramExtractor): r"/v(p/[0-9a-f]+/[0-9A-F]+)?/t51.2885-15/e35" r"/44877605_725955034447492_3123079845831750529_n.jpg", "keyword": { - "date": "type:datetime", + "date": "dt:2018-11-29 01:04:04", "description": str, "height": int, "likes": int, "media_id": "1922949326347663701", "shortcode": "BqvsDleB3lV", + "post_shortcode": "BqvsDleB3lV", "typename": "GraphImage", "username": "instagram", "width": int, @@ -324,6 +327,7 @@ class InstagramImageExtractor(InstagramExtractor): "keyword": { "sidecar_media_id": "1875629777499953996", "sidecar_shortcode": "BoHk1haB5tM", + "post_shortcode": "BoHk1haB5tM", "likes": int, "username": "instagram", } @@ -333,7 +337,7 @@ class InstagramImageExtractor(InstagramExtractor): ("https://www.instagram.com/p/Bqxp0VSBgJg/", { "pattern": r"/47129943_191645575115739_8539303288426725376_n\.mp4", "keyword": { - "date": "type:datetime", + "date": "dt:2018-11-29 19:23:58", "description": str, "height": int, "likes": int, @@ -349,7 +353,7 @@ class InstagramImageExtractor(InstagramExtractor): ("https://www.instagram.com/tv/BkQjCfsBIzi/", { "pattern": r"/10000000_1760663964018792_716207142595461120_n\.mp4", "keyword": { - "date": "type:datetime", + "date": "dt:2018-06-20 19:51:32", "description": str, "height": int, "likes": int, diff --git a/gallery_dl/extractor/issuu.py b/gallery_dl/extractor/issuu.py index 49d68ef..b34b288 100644 --- a/gallery_dl/extractor/issuu.py +++ b/gallery_dl/extractor/issuu.py @@ -35,7 +35,7 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor): "access" : "public", "articleStories": list, "contentRating" : dict, - "date" : "type:datetime", + "date" : "dt:2019-09-16 00:00:00", "description" : "re:Motions, the brand new publication by I", "documentId" : r"re:\d+-d99ec95935f15091b040cb8060f05510", "documentName" : "motions-1-2019", diff --git a/gallery_dl/extractor/kabeuchi.py b/gallery_dl/extractor/kabeuchi.py new file mode 100644 index 0000000..a8702f1 --- /dev/null +++ b/gallery_dl/extractor/kabeuchi.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- + +# Copyright 2020 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://kabe-uchiroom.com/""" + +from .common import Extractor, Message +from .. import text, exception + + +class KabeuchiUserExtractor(Extractor): + """Extractor for all posts of a user on kabe-uchiroom.com""" + category = "kabeuchi" + subcategory = "user" + directory_fmt = ("{category}", "{twitter_user_id} {twitter_id}") + filename_fmt = "{id}_{num:>02}{title:?_//}.{extension}" + archive_fmt = "{id}_{num}" + root = "https://kabe-uchiroom.com" + pattern = r"(?:https?://)?kabe-uchiroom\.com/mypage/?\?id=(\d+)" + test = ( + ("https://kabe-uchiroom.com/mypage/?id=919865303848255493", { + "pattern": (r"https://kabe-uchiroom\.com/accounts/upfile/3/" + r"919865303848255493/\w+\.jpe?g"), + "count": ">= 24", + }), + ("https://kabe-uchiroom.com/mypage/?id=123456789", { + "exception": exception.NotFoundError, + }), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.user_id = match.group(1) + + def items(self): + base = "{}/accounts/upfile/{}/{}/".format( + self.root, self.user_id[-1], self.user_id) + keys = ("image1", "image2", "image3", "image4", "image5", "image6") + + for post in self.posts(): + if post.get("is_ad") or not post["image1"]: + continue + + post["date"] = text.parse_datetime( + post["created_at"], "%Y-%m-%d %H:%M:%S") + yield Message.Directory, post + + for key in keys: + name = post[key] + if not name: + break + url = base + name + post["num"] = ord(key[-1]) - 48 + yield Message.Url, url, text.nameext_from_url(name, post) + + def posts(self): + url = "{}/mypage/?id={}".format(self.root, self.user_id) + response = self.request(url) + if response.history and response.url == self.root + "/": + raise exception.NotFoundError("user") + target_id = text.extract(response.text, 'user_friend_id = "', '"')[0] + return self._pagination(target_id) + + def _pagination(self, target_id): + url = "{}/get_posts.php".format(self.root) + data = { + "user_id" : "0", + "target_id" : target_id, + "type" : "uploads", + "sort_type" : "0", + "category_id": "all", + "latest_post": "", + "page_num" : 0, + } + + while True: + info = self.request(url, method="POST", data=data).json() + datas = info["datas"] + + if not datas or not isinstance(datas, list): + return + yield from datas + + last_id = datas[-1]["id"] + if last_id == info["last_data"]: + return + data["latest_post"] = last_id + data["page_num"] += 1 diff --git a/gallery_dl/extractor/khinsider.py b/gallery_dl/extractor/khinsider.py index c9e6959..822a743 100644 --- a/gallery_dl/extractor/khinsider.py +++ b/gallery_dl/extractor/khinsider.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2019 Mike Fährmann +# Copyright 2016-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract soundtracks from https://downloads.khinsider.com/""" +"""Extractors for https://downloads.khinsider.com/""" from .common import Extractor, Message, AsynchronousMixin from .. import text, exception @@ -16,54 +16,52 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor): """Extractor for soundtracks from khinsider.com""" category = "khinsider" subcategory = "soundtrack" - directory_fmt = ("{category}", "{album}") - archive_fmt = "{album}_{filename}.{extension}" + directory_fmt = ("{category}", "{album[name]}") + archive_fmt = "{filename}.{extension}" pattern = (r"(?:https?://)?downloads\.khinsider\.com" r"/game-soundtracks/album/([^/?&#]+)") + root = "https://downloads.khinsider.com" test = (("https://downloads.khinsider.com" "/game-soundtracks/album/horizon-riders-wii"), { - "pattern": r"https?://\d+\.\d+\.\d+\.\d+/ost/horizon-riders-wii/[^/]+" - r"/Horizon%20Riders%20Wii%20-%20Full%20Soundtrack\.mp3", - "count": 1, - "keyword": "b4f460c78dd23e1f1121f4ac784dd67ded7c2679", + "pattern": r"https?://vgmdownloads.com/soundtracks/horizon-riders-wii/" + r"[^/]+/Horizon%20Riders%20Wii%20-%20Full%20Soundtrack.mp3", + "keyword": "5b2c35cce638c326cab2a4f7a79f245d008d62ff", }) - root = "https://downloads.khinsider.com" def __init__(self, match): Extractor.__init__(self, match) self.album = match.group(1) def items(self): - url = (self.root + "/game-soundtracks/album/" + self.album) + url = self.root + "/game-soundtracks/album/" + self.album page = self.request(url, encoding="utf-8").text - data = self.get_job_metadata(page) + if "Download all songs at once:" not in page: + raise exception.NotFoundError("soundtrack") + + data = self.metadata(page) yield Message.Version, 1 yield Message.Directory, data - for url, track in self.get_album_tracks(page): + for track in self.tracks(page): track.update(data) - yield Message.Url, url, track + yield Message.Url, track["url"], track - def get_job_metadata(self, page): - """Collect metadata for extractor-job""" - if "Download all songs at once:" not in page: - raise exception.NotFoundError("soundtrack") - data = text.extract_all(page, ( - ("album", "Album name: <b>", "</b>"), - ("count", "Number of Files: <b>", "</b>"), - ("size" , "Total Filesize: <b>", "</b>"), - ("date" , "Date added: <b>", "</b>"), - ("type" , "Album type: <b>", "</b>"), - ))[0] - data["album"] = text.unescape(data["album"]) - return data + def metadata(self, page): + extr = text.extract_from(page) + return {"album": { + "name" : text.unescape(extr("Album name: <b>", "<")), + "count": text.parse_int(extr("Number of Files: <b>", "<")), + "size" : text.parse_bytes(extr("Total Filesize: <b>", "<")[:-1]), + "date" : extr("Date added: <b>", "<"), + "type" : extr("Album type: <b>", "<"), + }} - def get_album_tracks(self, page): - """Collect url and metadata for all tracks of a soundtrack""" + def tracks(self, page): page = text.extract(page, '<table id="songlist">', '</table>')[0] + for num, url in enumerate(text.extract_iter( page, '<td class="clickable-row"><a href="', '"'), 1): url = text.urljoin(self.root, url) page = self.request(url, encoding="utf-8").text - url = text.extract( - page, '<p><a style="color: #21363f;" href="', '"')[0] - yield url, text.nameext_from_url(url, {"num": num}) + + url = text.extract(page, 'style="color: #21363f;" href="', '"')[0] + yield text.nameext_from_url(url, {"num": num, "url": url}) diff --git a/gallery_dl/extractor/kissmanga.py b/gallery_dl/extractor/kissmanga.py index 7151de0..8809589 100644 --- a/gallery_dl/extractor/kissmanga.py +++ b/gallery_dl/extractor/kissmanga.py @@ -94,7 +94,7 @@ class KissmangaChapterExtractor(KissmangaBase, ChapterExtractor): }), ("https://kissmanga.com/Manga/Houseki-no-Kuni/Oneshot?id=404189", { "count": 49, - "keyword": "d44d1b21d08e4dbf888b0c450a3f1bc919588b4f", + "keyword": "cea131c9fe9c71309b3270cd86718d4d1198c31c", }), ("https://kissmanga.com/mAnGa/mOnStEr/Monster-79?id=7608"), ) diff --git a/gallery_dl/extractor/luscious.py b/gallery_dl/extractor/luscious.py index c80cf14..c31de1c 100644 --- a/gallery_dl/extractor/luscious.py +++ b/gallery_dl/extractor/luscious.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2019 Mike Fährmann +# Copyright 2016-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -59,7 +59,7 @@ class LusciousAlbumExtractor(LusciousExtractor): "cover" : "re:https://\\w+.luscious.net/.+/277031/", "created" : 1479625853, "created_by" : "NTRshouldbeillegal", - "date" : "type:datetime", + "date" : "dt:2016-11-20 07:10:53", "description" : "Enjoy.", "download_url": "/download/824778/277031/", "genres" : list, diff --git a/gallery_dl/extractor/mangareader.py b/gallery_dl/extractor/mangareader.py index d24d452..31083dc 100644 --- a/gallery_dl/extractor/mangareader.py +++ b/gallery_dl/extractor/mangareader.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2019 Mike Fährmann +# Copyright 2015-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -38,7 +38,7 @@ class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor): pattern = r"(?:https?://)?(?:www\.)?mangareader\.net((/[^/?&#]+)/(\d+))" test = (("https://www.mangareader.net" "/karate-shoukoushi-kohinata-minoru/11"), { - "url": "061cc92a07edf17bb991ce0821fa4c77a147a860", + "url": "3d8a5b900856d59b8d8e83908d0df392be92c0f4", "keyword": "133e3e2f7c0529a35bbb16149e34c40546f8dfd6", }) diff --git a/gallery_dl/extractor/mangoxo.py b/gallery_dl/extractor/mangoxo.py index 114a48e..8cd7fa5 100644 --- a/gallery_dl/extractor/mangoxo.py +++ b/gallery_dl/extractor/mangoxo.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019 Mike Fährmann +# Copyright 2019-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -12,6 +12,7 @@ from .common import Extractor, Message from .. import text, exception from ..cache import cache import hashlib +import time class MangoxoExtractor(Extractor): @@ -35,28 +36,34 @@ class MangoxoExtractor(Extractor): def _login_impl(self, username, password): self.log.info("Logging in as %s", username) - page = self.request(self.root + "/login/").text - token = text.extract(page, 'id="loginToken" value="', '"')[0] - if not token: - self.log.debug("failed to extract 'loginToken'") - - url = self.root + "/login/loginxmm" + url = self.root + "/api/login" headers = { "X-Requested-With": "XMLHttpRequest", "Referer": self.root + "/login", } - data = { - "name": username, - "password": hashlib.md5(password.encode()).hexdigest(), - "loginToken": token, - } + data = self._sign_by_md5(username, password) response = self.request(url, method="POST", headers=headers, data=data) - if response.json().get("result") != "1": - raise exception.AuthenticationError() + data = response.json() + if str(data.get("result")) != "1": + raise exception.AuthenticationError(data.get("msg")) return {"SESSION": self.session.cookies.get("SESSION")} @staticmethod + def _sign_by_md5(username, password): + # https://dns.mangoxo.com/libs/plugins/phoenix-ui/js/phoenix-ui.js + params = [ + ("username" , username), + ("password" , password), + ("timestamp", str(int(time.time()))), + ] + query = "&".join("=".join(item) for item in sorted(params)) + query += "&secretKey=996293536" + sign = hashlib.md5(query.encode()).hexdigest() + params.append(("sign", sign.upper())) + return params + + @staticmethod def _total_pages(page): return text.parse_int(text.extract(page, "total :", ",")[0]) diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index 54e60b0..21afeae 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2019 Mike Fährmann +# Copyright 2018-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -11,6 +11,7 @@ from .common import Extractor, Message from .. import text, exception from ..cache import cache +import itertools import json @@ -35,16 +36,17 @@ class NewgroundsExtractor(Extractor): for post_url in self.posts(): try: - file = self.extract_post(post_url) - url = file["url"] - # except Exception: + post = self.extract_post(post_url) + url = post.get("url") except OSError: url = None - if not url: - self.log.warning("Unable to get download URL for %s", post_url) - continue - yield Message.Directory, file - yield Message.Url, url, text.nameext_from_url(url, file) + + if url: + yield Message.Directory, post + yield Message.Url, url, text.nameext_from_url(url, post) + else: + self.log.warning( + "Unable to get download URL for '%s'", post_url) def posts(self): """Return urls of all relevant image pages""" @@ -82,7 +84,10 @@ class NewgroundsExtractor(Extractor): } def extract_post(self, post_url): - page = self.request(post_url).text + response = self.request(post_url, fatal=False) + if response.status_code >= 400: + return {} + page = response.text extr = text.extract_from(page) if "/art/view/" in post_url: @@ -97,8 +102,7 @@ class NewgroundsExtractor(Extractor): data["favorites"] = text.parse_int(extr( 'id="faves_load">', '<').replace(",", "")) data["score"] = text.parse_float(extr('id="score_number">', '<')) - data["tags"] = text.split_html(extr( - '<dd class="tags">', '</dd>')) + data["tags"] = text.split_html(extr('<dd class="tags">', '</dd>')) data["artist"] = [ text.extract(user, '//', '.')[0] for user in text.extract_iter(page, '<div class="item-user">', '>') @@ -194,7 +198,7 @@ class NewgroundsImageExtractor(NewgroundsExtractor): "keyword": { "artist" : ["tomfulp"], "comment" : "re:Consider this the bottom threshold for ", - "date" : "type:datetime", + "date" : "dt:2009-06-04 14:44:05", "description": "re:Consider this the bottom threshold for ", "favorites" : int, "filename" : "94_tomfulp_ryu-is-hawt", @@ -241,7 +245,7 @@ class NewgroundsMediaExtractor(NewgroundsExtractor): "keyword": { "artist" : ["psychogoldfish", "tomfulp"], "comment" : "re:People have been asking me how I like the ", - "date" : "type:datetime", + "date" : "dt:2012-02-08 21:40:56", "description": "re:People have been asking how I like the ", "favorites" : int, "filename" : "527818_alternate_1896", @@ -259,7 +263,7 @@ class NewgroundsMediaExtractor(NewgroundsExtractor): "keyword": { "artist" : ["zj", "tomfulp"], "comment" : "re:RECORDED 12-09-2014\n\nFrom The ZJ \"Late ", - "date" : "type:datetime", + "date" : "dt:2015-02-23 19:31:59", "description": "From The ZJ Report Show!", "favorites" : int, "index" : 609768, @@ -334,3 +338,53 @@ class NewgroundsUserExtractor(NewgroundsExtractor): (NewgroundsAudioExtractor , base + "audio"), (NewgroundsMoviesExtractor, base + "movies"), ), ("art",)) + + +class NewgroundsFavoriteExtractor(NewgroundsExtractor): + """Extractor for posts favorited by a newgrounds user""" + subcategory = "favorite" + directory_fmt = ("{category}", "{user}", "Favorites") + pattern = (r"(?:https?://)?([^.]+)\.newgrounds\.com" + r"/favorites(?:/(art|audio|movies))?/?") + test = ( + ("https://tomfulp.newgrounds.com/favorites/art", { + "range": "1-10", + "count": ">= 10", + }), + ("https://tomfulp.newgrounds.com/favorites/audio"), + ("https://tomfulp.newgrounds.com/favorites/movies"), + ("https://tomfulp.newgrounds.com/favorites/"), + ) + + def __init__(self, match): + NewgroundsExtractor.__init__(self, match) + self.kind = match.group(2) + + def posts(self): + if self.kind: + return self._pagination(self.kind) + return itertools.chain.from_iterable( + self._pagination(k) for k in ("art", "audio", "movies") + ) + + def _pagination(self, kind): + num = 1 + headers = { + "Accept": "application/json, text/javascript, */*; q=0.01", + "X-Requested-With": "XMLHttpRequest", + "Referer": self.user_root, + } + + while True: + url = "{}/favorites/{}/{}".format(self.user_root, kind, num) + response = self.request(url, headers=headers) + if response.history: + return + + favs = list(text.extract_iter( + response.text, 'href="//www.newgrounds.com', '"')) + for path in favs: + yield self.root + path + if len(favs) < 24: + return + num += 1 diff --git a/gallery_dl/extractor/nozomi.py b/gallery_dl/extractor/nozomi.py index 97be789..dfe31e3 100644 --- a/gallery_dl/extractor/nozomi.py +++ b/gallery_dl/extractor/nozomi.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019 Mike Fährmann +# Copyright 2019-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -86,7 +86,7 @@ class NozomiPostExtractor(NozomiExtractor): "character": ["patchouli knowledge"], "copyright": ["touhou"], "dataid" : "re:aaa9f7c632cde1e1a5baaff3fb6a6d857ec73df7fdc5cf5a", - "date" : "type:datetime", + "date" : "dt:2016-07-26 02:32:03", "extension": "jpg", "favorites": int, "filename" : str, diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index 74835bf..2f5b429 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2019 Mike Fährmann +# Copyright 2017-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Utility classes to setup OAuth and link a users account to gallery-dl""" +"""Utility classes to setup OAuth and link accounts to gallery-dl""" from .common import Extractor, Message from . import deviantart, flickr, reddit, smugmug, tumblr @@ -38,7 +38,7 @@ class OAuthBase(Extractor): print("Waiting for response. (Cancel with Ctrl+c)") server = socket.socket(socket.AF_INET, socket.SOCK_STREAM) server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - server.bind(("localhost", 6414)) + server.bind(("localhost", self.config("port", 6414))) server.listen(1) # workaround for ctrl+c not working during server.accept on Windows @@ -98,7 +98,7 @@ class OAuthBase(Extractor): def _oauth2_authorization_code_grant( self, client_id, client_secret, auth_url, token_url, scope="read", key="refresh_token", auth=True, - message_template=None): + message_template=None, cache=None): """Perform an OAuth2 authorization code grant""" state = "gallery-dl_{}_{}".format( @@ -162,6 +162,11 @@ class OAuthBase(Extractor): client_secret=client_secret, )) + # write to cache + if cache and config.get(("extractor", self.category), "cache"): + cache.update("#" + str(client_id), data[key]) + self.log.info("Writing 'refresh-token' to cache") + class OAuthDeviantart(OAuthBase): subcategory = "deviantart" @@ -179,6 +184,7 @@ class OAuthDeviantart(OAuthBase): "https://www.deviantart.com/oauth2/authorize", "https://www.deviantart.com/oauth2/token", scope="browse", + cache=deviantart._refresh_token_cache, ) diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py index a4731d0..931fb13 100644 --- a/gallery_dl/extractor/paheal.py +++ b/gallery_dl/extractor/paheal.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2019 Mike Fährmann +# Copyright 2018-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -45,7 +45,7 @@ class PahealTagExtractor(PahealExtractor): directory_fmt = ("{category}", "{search_tags}") pattern = (r"(?:https?://)?(?:rule34|rule63|cosplay)\.paheal\.net" r"/post/list/([^/?&#]+)") - test = ("https://rule34.paheal.net/post/list/k-on/1", { + test = ("https://rule34.paheal.net/post/list/Ayane_Suzuki/1", { "pattern": r"https://[^.]+\.paheal\.net/_images/\w+/\d+%20-%20", "count": ">= 15" }) diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index 1e52559..0d51df2 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019 Mike Fährmann +# Copyright 2019-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -12,6 +12,7 @@ from .common import Extractor, Message from .. import text, exception from ..cache import memcache import collections +import itertools import json @@ -33,43 +34,62 @@ class PatreonExtractor(Extractor): PatreonExtractor._warning = False for post in self.posts(): - ids = set() post["num"] = 0 - content = post.get("content") - postfile = post.get("post_file") + hashes = set() yield Message.Directory, post yield Message.Metadata, text.nameext_from_url( post["creator"].get("image_url", ""), post) - for image in post["images"]: - url = image.get("download_url") - if not url: - continue - ids.add(url.split("/")[-2]) - name = image.get("file_name") or self._filename(url) or url + for kind, url, name in itertools.chain( + self._postfile(post), + self._images(post), + self._attachments(post), + self._content(post), + ): + fhash = url.rsplit("/", 2)[1] + if fhash not in hashes: + hashes.add(fhash) + post["hash"] = fhash + post["type"] = kind + post["num"] += 1 + yield Message.Url, url, text.nameext_from_url(name, post) + else: + self.log.debug("skipping %s (%s %s)", url, fhash, kind) - post["num"] += 1 - post["type"] = "image" - yield Message.Url, url, text.nameext_from_url(name, post) + @staticmethod + def _postfile(post): + postfile = post.get("post_file") + if postfile: + return (("postfile", postfile["url"], postfile["name"]),) + return () + + def _images(self, post): + for image in post["images"]: + url = image.get("download_url") + if url: + name = image.get("file_name") or self._filename(url) or url + yield "image", url, name - if postfile and postfile["url"].split("/")[-2] not in ids: - post["num"] += 1 - post["type"] = "postfile" - text.nameext_from_url(postfile["name"], post) - yield Message.Url, postfile["url"], post + def _attachments(self, post): + for attachment in post["attachments"]: + url = self.request( + attachment["url"], method="HEAD", + allow_redirects=False, fatal=False, + ).headers.get("Location") - for attachment in post["attachments"]: - post["num"] += 1 - post["type"] = "attachment" - text.nameext_from_url(attachment["name"], post) - yield Message.Url, attachment["url"], post + if url: + yield "attachment", url, attachment["name"] - if content: - for url in text.extract_iter(content, 'src="', '"'): - post["num"] += 1 - post["type"] = "content" - yield Message.Url, url, text.nameext_from_url(url, post) + @staticmethod + def _content(post): + content = post.get("content") + if content: + for img in text.extract_iter( + content, '<img data-media-id="', '>'): + url = text.extract(img, 'src="', '"')[0] + if url: + yield "content", url, url def posts(self): """Return all relevant post objects""" @@ -238,11 +258,13 @@ class PatreonPostExtractor(PatreonExtractor): subcategory = "post" pattern = r"(?:https?://)?(?:www\.)?patreon\.com/posts/([^/?&#]+)" test = ( + # postfile + attachments ("https://www.patreon.com/posts/precious-metal-23563293", { "count": 4, }), - ("https://www.patreon.com/posts/er1-28201153", { - "count": 1, + # postfile + content + ("https://www.patreon.com/posts/19987002", { + "count": 4, }), ("https://www.patreon.com/posts/not-found-123", { "exception": exception.NotFoundError, diff --git a/gallery_dl/extractor/piczel.py b/gallery_dl/extractor/piczel.py index 5f50245..35f9f91 100644 --- a/gallery_dl/extractor/piczel.py +++ b/gallery_dl/extractor/piczel.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2019 Mike Fährmann +# Copyright 2018-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -22,27 +22,30 @@ class PiczelExtractor(Extractor): def items(self): yield Message.Version, 1 - for image in self.unpack(self.images()): - url = self.root + "/static" + image["image"]["image"]["url"] - yield Message.Directory, image - yield Message.Url, url, text.nameext_from_url(url, image) - - @staticmethod - def unpack(images): - """Unpack 'images' into individual image objects""" - for image in images: - if image["multi"]: - multi = image["images"] - del image["images"] - for image["num"], img in enumerate(multi): - image["image"] = img - yield image + for post in self.posts(): + post["tags"] = [t["title"] for t in post["tags"] if t["title"]] + post["date"] = text.parse_datetime( + post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z") + + if post["multi"]: + images = post["images"] + del post["images"] + yield Message.Directory, post + for post["num"], image in enumerate(images): + if "id" in image: + del image["id"] + post.update(image) + url = post["image"]["url"] + yield Message.Url, url, text.nameext_from_url(url, post) + else: - image["num"] = 0 - yield image + yield Message.Directory, post + post["num"] = 0 + url = post["image"]["url"] + yield Message.Url, url, text.nameext_from_url(url, post) - def images(self): - """Return an iterable with all relevant image objects""" + def posts(self): + """Return an iterable with all relevant post objects""" def _pagination(self, url, folder_id=None): params = { @@ -53,26 +56,26 @@ class PiczelExtractor(Extractor): while True: data = self.request(url, params=params).json() - yield from data - - if len(data) < 32: + if not data: return params["from_id"] = data[-1]["id"] + yield from data class PiczelUserExtractor(PiczelExtractor): """Extractor for all images from a user's gallery""" subcategory = "user" pattern = r"(?:https?://)?(?:www\.)?piczel\.tv/gallery/([^/?&#]+)/?$" - test = ("https://piczel.tv/gallery/Maximumwarp", { - "count": ">= 45", + test = ("https://piczel.tv/gallery/Bikupan", { + "range": "1-100", + "count": ">= 100", }) def __init__(self, match): PiczelExtractor.__init__(self, match) self.user = match.group(1) - def images(self): + def posts(self): url = "{}/api/users/{}/gallery".format(self.root, self.user) return self._pagination(url) @@ -92,7 +95,7 @@ class PiczelFolderExtractor(PiczelExtractor): PiczelExtractor.__init__(self, match) self.user, self.folder_id = match.groups() - def images(self): + def posts(self): url = "{}/api/users/{}/gallery".format(self.root, self.user) return self._pagination(url, self.folder_id) @@ -106,6 +109,7 @@ class PiczelImageExtractor(PiczelExtractor): "content": "df9a053a24234474a19bce2b7e27e0dec23bff87", "keyword": { "created_at": "2018-07-22T05:13:58.000Z", + "date": "dt:2018-07-22 05:13:58", "description": None, "extension": "png", "favorites_count": int, @@ -118,7 +122,7 @@ class PiczelImageExtractor(PiczelExtractor): "nsfw": False, "num": 0, "password_protected": False, - "tags": "fanart, commission, altair, recreators, ", + "tags": ["fanart", "commission", "altair", "recreators"], "title": "Altair", "user": dict, "views": int, @@ -129,6 +133,6 @@ class PiczelImageExtractor(PiczelExtractor): PiczelExtractor.__init__(self, match) self.image_id = match.group(1) - def images(self): + def posts(self): url = "{}/api/gallery/image/{}".format(self.root, self.image_id) return (self.request(url).json(),) diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index 8a10028..eaf97fd 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -12,6 +12,7 @@ from .common import Extractor, Message from .. import text, exception from ..cache import cache from datetime import datetime, timedelta +import itertools import hashlib import time @@ -27,11 +28,11 @@ class PixivExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) self.api = PixivAppAPI(self) - self.user_id = -1 self.load_ugoira = self.config("ugoira", True) def items(self): - metadata = self.get_metadata() + ratings = {0: "General", 1: "R-18", 2: "R-18G"} + metadata = self.metadata() yield Message.Version, 1 for work in self.works(): @@ -46,6 +47,7 @@ class PixivExtractor(Extractor): work["num"] = 0 work["tags"] = [tag["name"] for tag in work["tags"]] work["date"] = text.parse_datetime(work["create_date"]) + work["rating"] = ratings.get(work["x_restrict"]) work["suffix"] = "" work.update(metadata) @@ -74,11 +76,9 @@ class PixivExtractor(Extractor): def works(self): """Return an iterable containing all relevant 'work'-objects""" - def get_metadata(self, user=None): + def metadata(self): """Collect metadata for extractor-job""" - if not user: - user = self.api.user_detail(self.user_id) - return {"user": user} + return {} class PixivUserExtractor(PixivExtractor): @@ -102,8 +102,15 @@ class PixivUserExtractor(PixivExtractor): "&tag=%E6%89%8B%E3%81%B6%E3%82%8D"), { "url": "25b1cd81153a8ff82eec440dd9f20a4a22079658", }), + # avatar (#595, 623) + ("https://www.pixiv.net/en/users/173530", { + "options": (("avatar", True),), + "content": "22af450d4dbaf4973d370f164f66f48c7382a6de", + "range": "1", + }), + # deleted account ("http://www.pixiv.net/member_illust.php?id=173531", { - "exception": exception.NotFoundError, + "count": 0, }), ("https://www.pixiv.net/en/users/173530"), ("https://www.pixiv.net/en/users/173530/manga"), @@ -136,6 +143,27 @@ class PixivUserExtractor(PixivExtractor): if tag in [t["name"].lower() for t in work["tags"]] ) + if self.config("avatar"): + user = self.api.user_detail(self.user_id) + url = user["profile_image_urls"]["medium"].replace("_170.", ".") + avatar = { + "create_date" : None, + "height" : 0, + "id" : "avatar", + "image_urls" : None, + "meta_pages" : (), + "meta_single_page": {"original_image_url": url}, + "page_count" : 1, + "sanity_level" : 0, + "tags" : (), + "title" : "avatar", + "type" : "avatar", + "user" : user, + "width" : 0, + "x_restrict" : 0, + } + works = itertools.chain((avatar,), works) + return works @@ -203,15 +231,9 @@ class PixivWorkExtractor(PixivExtractor): def __init__(self, match): PixivExtractor.__init__(self, match) self.illust_id = match.group(1) or match.group(2) - self.load_ugoira = True - self.work = None def works(self): - return (self.work,) - - def get_metadata(self, user=None): - self.work = self.api.illust_detail(self.illust_id) - return PixivExtractor.get_metadata(self, self.work["user"]) + return (self.api.illust_detail(self.illust_id),) class PixivFavoriteExtractor(PixivExtractor): @@ -220,8 +242,8 @@ class PixivFavoriteExtractor(PixivExtractor): directory_fmt = ("{category}", "bookmarks", "{user_bookmark[id]} {user_bookmark[account]}") archive_fmt = "f_{user_bookmark[id]}_{id}{num}.{extension}" - pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/" - r"(?:(?:en/)?users/(\d+)/(bookmarks/artworks|following)" + pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/(?:(?:en/)?" + r"users/(\d+)/(bookmarks/artworks(?:/([^/?&#]+))?|following)" r"|bookmark\.php(?:\?([^#]*))?)") test = ( ("https://www.pixiv.net/en/users/173530/bookmarks/artworks", { @@ -231,20 +253,29 @@ class PixivFavoriteExtractor(PixivExtractor): "url": "e717eb511500f2fa3497aaee796a468ecf685cc4", }), # bookmarks with specific tag + (("https://www.pixiv.net/en/users/3137110" + "/bookmarks/artworks/%E3%81%AF%E3%82%93%E3%82%82%E3%82%93"), { + "url": "379b28275f786d946e01f721e54afe346c148a8c", + }), + # bookmarks with specific tag (legacy url) (("https://www.pixiv.net/bookmark.php?id=3137110" "&tag=%E3%81%AF%E3%82%93%E3%82%82%E3%82%93&p=1"), { - "count": 2, + "url": "379b28275f786d946e01f721e54afe346c148a8c", }), # own bookmarks ("https://www.pixiv.net/bookmark.php", { "url": "90c1715b07b0d1aad300bce256a0bc71f42540ba", }), + # own bookmarks with tag (#596) + ("https://www.pixiv.net/bookmark.php?tag=foobar", { + "count": 0, + }), # followed users (#515) ("https://www.pixiv.net/en/users/173530/following", { "pattern": PixivUserExtractor.pattern, "count": ">= 12", }), - # followed users (#515) + # followed users (legacy url) (#515) ("https://www.pixiv.net/bookmark.php?id=173530&type=user", { "pattern": PixivUserExtractor.pattern, "count": ">= 12", @@ -255,11 +286,11 @@ class PixivFavoriteExtractor(PixivExtractor): ) def __init__(self, match): - uid, kind, query = match.groups() + uid, kind, self.tag, query = match.groups() if query: self.query = text.parse_query(query) - uid = self.query.get("id", -1) + uid = self.query.get("id") if not uid: self.subcategory = "bookmark" elif self.query.get("type") == "user": @@ -280,12 +311,15 @@ class PixivFavoriteExtractor(PixivExtractor): if "tag" in self.query: tag = text.unquote(self.query["tag"]) + elif self.tag: + tag = text.unquote(self.tag) + if "rest" in self.query and self.query["rest"] == "hide": restrict = "private" return self.api.user_bookmarks_illust(self.user_id, tag, restrict) - def get_metadata(self, user=None): + def metadata(self): if self.user_id: user = self.api.user_detail(self.user_id) else: @@ -301,7 +335,7 @@ class PixivFavoriteExtractor(PixivExtractor): for preview in self.api.user_following(self.user_id): user = preview["user"] user["_extractor"] = PixivUserExtractor - url = "https://www.pixiv.net/member.php?id={}".format(user["id"]) + url = "https://www.pixiv.net/users/{}".format(user["id"]) yield Message.Queue, url, user @@ -327,7 +361,7 @@ class PixivRankingExtractor(PixivExtractor): def works(self): return self.api.illust_ranking(self.mode, self.date) - def get_metadata(self, user=None): + def metadata(self): query = text.parse_query(self.query) mode = query.get("mode", "daily").lower() @@ -393,7 +427,7 @@ class PixivSearchExtractor(PixivExtractor): def works(self): return self.api.search_illust(self.word, self.sort, self.target) - def get_metadata(self, user=None): + def metadata(self): query = text.parse_query(self.query) if self.word: @@ -446,7 +480,7 @@ class PixivFollowExtractor(PixivExtractor): def works(self): return self.api.illust_follow() - def get_metadata(self, user=None): + def metadata(self): self.api.login() return {"user_follow": self.api.user} diff --git a/gallery_dl/extractor/pururin.py b/gallery_dl/extractor/pururin.py index aa5c9c6..721fc2f 100644 --- a/gallery_dl/extractor/pururin.py +++ b/gallery_dl/extractor/pururin.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019 Mike Fährmann +# Copyright 2019-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -21,8 +21,8 @@ class PururinGalleryExtractor(GalleryExtractor): ("https://pururin.io/gallery/38661/iowant-2", { "pattern": r"https://cdn.pururin.io/\w+/images/data/\d+/\d+\.jpg", "keyword": { - "title" : "Iowant 2!!", - "title_en" : "Iowant 2!!", + "title" : "re:I ?owant 2!!", + "title_en" : "re:I ?owant 2!!", "title_jp" : "", "gallery_id": 38661, "count" : 19, diff --git a/gallery_dl/extractor/realbooru.py b/gallery_dl/extractor/realbooru.py index 6d89151..70b4833 100644 --- a/gallery_dl/extractor/realbooru.py +++ b/gallery_dl/extractor/realbooru.py @@ -30,7 +30,7 @@ class RealbooruTagExtractor(booru.TagMixin, RealbooruExtractor): pattern = (r"(?:https?://)?(?:www\.)?realbooru\.com/(?:index\.php)?" r"\?page=post&s=list&tags=(?P<tags>[^&#]+)") test = ("https://realbooru.com/index.php?page=post&s=list&tags=wine", { - "count": 64, + "count": ">= 64", }) diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 4c83019..a312c1c 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -9,74 +9,104 @@ """Extractors for https://www.reddit.com/""" from .common import Extractor, Message -from .. import text, util, extractor, exception +from .. import text, util, exception from ..cache import cache class RedditExtractor(Extractor): """Base class for reddit extractors""" category = "reddit" + directory_fmt = ("{category}", "{subreddit}") + filename_fmt = "{id} {title[:242]}.{extension}" + archive_fmt = "{filename}" cookiedomain = None def __init__(self, match): Extractor.__init__(self, match) self.api = RedditAPI(self) - self.max_depth = int(self.config("recursion", 0)) - self._visited = set() + self.max_depth = self.config("recursion", 0) def items(self): - subre = RedditSubmissionExtractor.pattern + match_submission = RedditSubmissionExtractor.pattern.match + match_subreddit = RedditSubredditExtractor.pattern.match + match_user = RedditUserExtractor.pattern.match + + parentdir = self.config("parent-directory") + videos = self.config("videos", True) + submissions = self.submissions() + visited = set() depth = 0 yield Message.Version, 1 - with extractor.blacklist( - util.SPECIAL_EXTRACTORS, - [RedditSubredditExtractor, RedditUserExtractor]): - while True: - extra = [] - for url, data in self._urls(submissions): - if url[0] == "#": + + while True: + extra = [] + + for submission, comments in submissions: + urls = [] + + if submission: + yield Message.Directory, submission + visited.add(submission["id"]) + url = submission["url"] + + if url.startswith("https://i.redd.it/"): + text.nameext_from_url(url, submission) + yield Message.Url, url, submission + + elif submission["is_video"]: + if videos: + text.nameext_from_url(url, submission) + if videos == "ytdl": + url = "https://www.reddit.com" + \ + submission["permalink"] + else: + submission["_ytdl_extra"] = { + "title": submission["title"], + } + yield Message.Url, "ytdl:" + url, submission + + elif not submission["is_self"]: + urls.append((url, submission)) + + elif parentdir: + yield Message.Directory, comments[0] + + if self.api.comments: + if submission: + for url in text.extract_iter( + submission["selftext_html"] or "", + ' href="', '"'): + urls.append((url, submission)) + for comment in comments: + for url in text.extract_iter( + comment["body_html"] or "", ' href="', '"'): + urls.append((url, comment)) + + for url, data in urls: + if not url or url[0] == "#": continue if url[0] == "/": url = "https://www.reddit.com" + url - match = subre.match(url) + match = match_submission(url) if match: extra.append(match.group(1)) - else: + elif not match_user(url) and not match_subreddit(url): yield Message.Queue, text.unescape(url), data - if not extra or depth == self.max_depth: - return - depth += 1 - submissions = ( - self.api.submission(sid) for sid in extra - if sid not in self._visited - ) + if not extra or depth == self.max_depth: + return + depth += 1 + submissions = ( + self.api.submission(sid) for sid in extra + if sid not in self._visited + ) def submissions(self): """Return an iterable containing all (submission, comments) tuples""" - def _urls(self, submissions): - for submission, comments in submissions: - - if submission: - self._visited.add(submission["id"]) - - if not submission["is_self"]: - yield submission["url"], submission - - for url in text.extract_iter( - submission["selftext_html"] or "", ' href="', '"'): - yield url, submission - - if comments: - for comment in comments: - for url in text.extract_iter( - comment["body_html"] or "", ' href="', '"'): - yield url, comment - class RedditSubredditExtractor(RedditExtractor): """Extractor for URLs from subreddits on reddit.com""" @@ -84,7 +114,10 @@ class RedditSubredditExtractor(RedditExtractor): pattern = (r"(?:https?://)?(?:\w+\.)?reddit\.com/r/" r"([^/?&#]+(?:/[a-z]+)?)/?(?:\?([^#]*))?(?:$|#)") test = ( - ("https://www.reddit.com/r/lavaporn/"), + ("https://www.reddit.com/r/lavaporn/", { + "range": "1-20", + "count": ">= 20", + }), ("https://www.reddit.com/r/lavaporn/top/?sort=top&t=month"), ("https://old.reddit.com/r/lavaporn/"), ("https://np.reddit.com/r/lavaporn/"), @@ -210,7 +243,7 @@ class RedditAPI(): link_id = "t3_" + submission_id if self.morecomments else None submission, comments = self._call(endpoint, {"limit": self.comments}) return (submission["data"]["children"][0]["data"], - self._flatten(comments, link_id) if self.comments else None) + self._flatten(comments, link_id) if self.comments else ()) def submissions_subreddit(self, subreddit, params): """Collect all (submission, comments)-tuples of a subreddit""" @@ -290,7 +323,8 @@ class RedditAPI(): raise exception.AuthorizationError() if data["error"] == 404: raise exception.NotFoundError() - raise Exception(data["message"]) + self.log.debug(data) + raise exception.StopExtraction(data.get("message")) return data def _pagination(self, endpoint, params): @@ -315,7 +349,7 @@ class RedditAPI(): except exception.AuthorizationError: pass else: - yield post, None + yield post, () elif kind == "t1" and self.comments: yield None, (post,) diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py index 2c9746e..521b034 100644 --- a/gallery_dl/extractor/sexcom.py +++ b/gallery_dl/extractor/sexcom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019 Mike Fährmann +# Copyright 2019-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -83,11 +83,11 @@ class SexcomExtractor(Extractor): data["url"] = "ytdl:" + text.extract( extr('<iframe', '>'), ' src="', '"')[0] else: - data["url"] = extr(' src="', '"') + data["url"] = text.unescape(extr(' src="', '"').partition("?")[0]) text.nameext_from_url(data["url"], data) data["uploader"] = extr('itemprop="author">', '<') - data["date"] = extr('datetime="', '"') + data["date"] = text.parse_datetime(extr('datetime="', '"')) data["tags"] = text.split_html(extr('class="tags"> Tags', '</div>')) data["comments"] = text.parse_int(extr('Comments (', ')')) @@ -102,28 +102,28 @@ class SexcomPinExtractor(SexcomExtractor): test = ( # picture ("https://www.sex.com/pin/56714360/", { - "url": "599190d6e3d79f9f49dda194a0a58cb0ffa3ab86", - "content": "963ed681cf53904173c7581b713c7f9471f04db0", + "pattern": "https://cdn.sex.com/images/.+/2018/10/02/20037816.jpg", + "content": "e579e3283fea812d0545a3f79734b79bc3c51acb", "keyword": { - "comments": int, - "date": "2018-10-02T21:18:17-04:00", + "comments" : int, + "date" : "dt:2018-10-02 21:18:17", "extension": "jpg", - "filename": "20037816", - "likes": int, - "pin_id": 56714360, - "repins": int, - "tags": list, + "filename" : "20037816", + "likes" : int, + "pin_id" : 56714360, + "repins" : int, + "tags" : list, "thumbnail": str, - "title": "Pin #56714360", - "type": "picture", - "uploader": "alguem", - "url": str, + "title" : "Pin #56714360", + "type" : "picture", + "uploader" : "alguem", + "url" : str, }, }), # gif ("https://www.sex.com/pin/11465040-big-titted-hentai-gif/", { - "url": "98a82c5ae7a65c8228e1405ac740f80d4d556de1", - "content": "a54b37eb39d565094c54ad7d21244fe8f978fb14", + "pattern": "https://cdn.sex.com/images/.+/2014/01/26/4829951.gif", + "content": "af6726d74d11d819e1c885fe5303f711862eae96", }), # video ("https://www.sex.com/pin/55748341/", { @@ -134,10 +134,6 @@ class SexcomPinExtractor(SexcomExtractor): ("https://www.sex.com/pin/55847384-very-nicely-animated/", { "pattern": "ytdl:https://www.pornhub.com/embed/ph56ef24b6750f2", }), - # 404 - ("https://www.sex.com/pin/55847385/", { - "count": 0, - }), ) def __init__(self, match): diff --git a/gallery_dl/extractor/tsumino.py b/gallery_dl/extractor/tsumino.py index 298b7e0..31dbdad 100644 --- a/gallery_dl/extractor/tsumino.py +++ b/gallery_dl/extractor/tsumino.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019 Mike Fährmann +# Copyright 2019-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -52,7 +52,7 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor): "title_en" : r"re:Shikoshiko Daisuki Nightingale \+ Kaijou", "title_jp" : "シコシコ大好きナイチンゲール + 会場限定おまけ本", "gallery_id": 40996, - "date" : "type:datetime", + "date" : "dt:2018-06-29 00:00:00", "count" : 42, "collection": "", "artist" : ["Itou Life"], diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index a1f2199..0505fa9 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -310,7 +310,7 @@ class TumblrTagExtractor(TumblrExtractor): def __init__(self, match): TumblrExtractor.__init__(self, match) - self.tag = text.unquote(match.group(3)) + self.tag = text.unquote(match.group(3).replace("-", " ")) def posts(self): return self.api.posts(self.blog, {"tag": self.tag}) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index dc558c0..2a04463 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2019 Mike Fährmann +# Copyright 2016-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images from https://twitter.com/""" +"""Extractors for https://twitter.com/""" from .common import Extractor, Message from .. import text, exception @@ -21,8 +21,11 @@ class TwitterExtractor(Extractor): directory_fmt = ("{category}", "{user[name]}") filename_fmt = "{tweet_id}_{num}.{extension}" archive_fmt = "{tweet_id}_{retweet_id}_{num}" + cookiedomain = ".twitter.com" root = "https://twitter.com" sizes = (":orig", ":large", ":medium", ":small") + user_agent = ("Mozilla/5.0 (Windows NT 6.1; WOW64; " + "Trident/7.0; rv:11.0) like Gecko") def __init__(self, match): Extractor.__init__(self, match) @@ -32,7 +35,7 @@ class TwitterExtractor(Extractor): self.retweets = self.config("retweets", True) self.twitpic = self.config("twitpic", False) self.content = self.config("content", False) - self.videos = self.config("videos", False) + self.videos = self.config("videos", True) if self.content: self._emoji_sub = re.compile( @@ -117,7 +120,8 @@ class TwitterExtractor(Extractor): def _login_impl(self, username, password): self.log.info("Logging in as %s", username) - page = self.request(self.root + "/login").text + headers = {"User-Agent": self.user_agent} + page = self.request(self.root + "/login", headers=headers).text pos = page.index('name="authenticity_token"') token = text.extract(page, 'value="', '"', pos-80)[0] @@ -131,11 +135,15 @@ class TwitterExtractor(Extractor): "redirect_after_login" : "", "remember_me" : "1", } - response = self.request(url, method="POST", data=data) - + response = self.request(url, method="POST", headers=headers, data=data) if "/error" in response.url: raise exception.AuthenticationError() - return self.session.cookies + + return { + cookie.name: cookie.value + for cookie in self.session.cookies + if cookie.domain and "twitter.com" in cookie.domain + } def _data_from_tweet(self, tweet): extr = text.extract_from(tweet) @@ -353,7 +361,11 @@ class TwitterTweetExtractor(TwitterExtractor): # content with emoji, newlines, hashtags (#338) ("https://twitter.com/yumi_san0112/status/1151144618936823808", { "options": (("content", True),), - "keyword": "0b7a3d05607b480c1412dfd85f8606478313e7bf", + "keyword": {"content": ( + "re:晴、お誕生日おめでとう🎉!\n実は下の名前が同じなので結構親近感ある" + "アイドルです✨\n今年の晴ちゃんめちゃくちゃ可愛い路線攻めてるから、そろ" + "そろまたかっこいい晴が見たいですねw\n#結城晴生誕祭2019\n#結城晴生誕祭" + )}, }), # Reply to another tweet (#403) ("https://twitter.com/tyson_hesse/status/1103767554424598528", { @@ -365,9 +377,12 @@ class TwitterTweetExtractor(TwitterExtractor): "pattern": r"https://pbs.twimg.com/media/EAel0vUUYAAZ4Bq.jpg:orig", }), # quoted tweet (#526) - ("https://twitter.com/Meiyu_miu/status/1070693241413021696", { - "count": 4, - "keyword": "0c627af2b8cdccc7e0da8fd221155c4a4a3141a8", + ("https://twitter.com/Pistachio/status/1222690391817932803", { + "pattern": r"https://pbs\.twimg\.com/media/EPfMfDUU8AAnByO\.jpg", + "keyword": { + "author": {"name": "Afro_Herper", "id": 786047748508221440}, + "user" : {"name": "Pistachio" , "id": 3533231}, + }, }), # TwitPic embeds (#579) ("https://twitter.com/i/web/status/112900228289540096", { @@ -384,11 +399,7 @@ class TwitterTweetExtractor(TwitterExtractor): def tweets(self): url = "{}/i/web/status/{}".format(self.root, self.tweet_id) cookies = {"app_shell_visited": "1"} - headers = { - "Referer" : url, - "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; " - "Trident/7.0; rv:11.0) like Gecko", - } + headers = {"User-Agent": self.user_agent, "Referer": url} response = self.request(url, cookies=cookies, headers=headers) if response.history and response.url == self.root + "/": @@ -400,6 +411,81 @@ class TwitterTweetExtractor(TwitterExtractor): return (page[beg:end],) +class TwitterBookmarkExtractor(TwitterExtractor): + """Extractor for bookmarked tweets""" + subcategory = "bookmark" + pattern = r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com/i/bookmarks()" + test = ("https://twitter.com/i/bookmarks",) + + def items(self): + self.login() + if not self.logged_in: + raise exception.AuthorizationError("Login required") + for cookie in self.session.cookies: + cookie.expires = None + + url = "https://api.twitter.com/2/timeline/bookmark.json" + params = { + "include_profile_interstitial_type": "1", + "include_blocking": "1", + "include_blocked_by": "1", + "include_followed_by": "1", + "include_want_retweets": "1", + "include_mute_edge": "1", + "include_can_dm": "1", + "include_can_media_tag": "1", + "skip_status": "1", + "cards_platform": "Web-12", + "include_cards": "1", + "include_composer_source": "true", + "include_ext_alt_text": "true", + "include_reply_count": "1", + "tweet_mode": "extended", + "include_entities": "true", + "include_user_entities": "true", + "include_ext_media_color": "true", + "include_ext_media_availability": "true", + "send_error_codes": "true", + "simple_quoted_tweets": "true", + "count": "100", + "cursor": None, + "ext": "mediaStats%2CcameraMoment", + } + headers = { + "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR" + "COuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu" + "4FA33AGWWjCpTnA", + "Origin": self.root, + "Referer": self.root + "/i/bookmarks", + "x-csrf-token": self.session.cookies.get("ct0"), + "x-twitter-active-user": "yes", + "x-twitter-auth-type": "Auth2Session", + "x-twitter-client-language": "en", + } + + while True: + response = self.request( + url, params=params, headers=headers, fatal=False) + if response.status_code >= 400: + raise exception.StopExtraction(response.text) + data = response.json() + tweets = data["globalObjects"]["tweets"] + + if not tweets: + return + for tweet_id, tweet_data in tweets.items(): + tweet_url = "{}/i/web/status/{}".format(self.root, tweet_id) + tweet_data["_extractor"] = TwitterTweetExtractor + yield Message.Queue, tweet_url, tweet_data + + inst = data["timeline"]["instructions"][0] + for entry in inst["addEntries"]["entries"]: + if entry["entryId"].startswith("cursor-bottom-"): + params["cursor"] = \ + entry["content"]["operation"]["cursor"]["value"] + break + + @memcache() def _guest_token(extr, headers): return extr.request( diff --git a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py index a24d3fe..a020064 100644 --- a/gallery_dl/extractor/vsco.py +++ b/gallery_dl/extractor/vsco.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019 Mike Fährmann +# Copyright 2019-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -34,6 +34,9 @@ class VscoExtractor(Extractor): yield Message.Directory, {"user": self.user} for img in self.images(): + if not img or "responsive_url" not in img: + continue + if img["is_video"]: if not videos: continue @@ -98,6 +101,8 @@ class VscoExtractor(Extractor): @staticmethod def _transform_media(media): + if "responsiveUrl" not in media: + return None media["_id"] = media["id"] media["is_video"] = media["isVideo"] media["grid_name"] = media["gridName"] @@ -111,18 +116,19 @@ class VscoExtractor(Extractor): class VscoUserExtractor(VscoExtractor): """Extractor for images from a user on vsco.co""" subcategory = "user" - pattern = BASE_PATTERN + r"(?:/images(?:/\d+)?)?/?(?:$|[?#])" + pattern = BASE_PATTERN + r"(?:/gallery|/images(?:/\d+)?)?/?(?:$|[?#])" test = ( - ("https://vsco.co/missuri/images/1", { + ("https://vsco.co/missuri/gallery", { "pattern": r"https://image(-aws.+)?\.vsco\.co/[0-9a-f/]+/vsco\w+", "range": "1-80", "count": 80, }), + ("https://vsco.co/missuri/images/1"), ("https://vsco.co/missuri"), ) def images(self): - url = "{}/{}/images/1".format(self.root, self.user) + url = "{}/{}/gallery".format(self.root, self.user) data = self._extract_preload_state(url) tkn = data["users"]["currentUser"]["tkn"] @@ -186,7 +192,7 @@ class VscoImageExtractor(VscoExtractor): "grid" : "erenyildiz", "meta" : dict, "tags" : list, - "date" : "type:datetime", + "date" : "dt:2019-07-21 19:12:11", "video" : False, "width" : 1537, "height": 1537, diff --git a/gallery_dl/extractor/wallhaven.py b/gallery_dl/extractor/wallhaven.py index 737c253..043da0b 100644 --- a/gallery_dl/extractor/wallhaven.py +++ b/gallery_dl/extractor/wallhaven.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2019 Mike Fährmann +# Copyright 2018-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images from https://wallhaven.cc/""" +"""Extractors for https://wallhaven.cc/""" from .common import Extractor, Message from .. import text @@ -77,7 +77,7 @@ class WallhavenImageExtractor(WallhavenExtractor): "group" : "Owner/Developer", "username" : "AksumkA", }, - "date" : "type:datetime", + "date" : "dt:2014-08-31 06:17:19", "wh_category": "anime", "views" : int, "favorites" : int, diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py index 49fa082..6a779d9 100644 --- a/gallery_dl/extractor/weibo.py +++ b/gallery_dl/extractor/weibo.py @@ -56,7 +56,7 @@ class WeiboExtractor(Extractor): info = obj["page_info"]["media_info"] url = info.get("stream_url_hd") or info.get("stream_url") - if url and not info.get("goto"): + if url: data = text.nameext_from_url(url, { "num" : num, "pid" : 0, @@ -65,6 +65,10 @@ class WeiboExtractor(Extractor): "height": 0, "status": status, }) + if data["extension"] == "m3u8": + url = "ytdl:" + url + data["extension"] = "mp4" + data["_ytdl_extra"] = {"protocol": "m3u8_native"} yield Message.Url, url, data if self.retweets and "retweeted_status" in obj: diff --git a/gallery_dl/extractor/xhamster.py b/gallery_dl/extractor/xhamster.py index 62acb28..0422589 100644 --- a/gallery_dl/extractor/xhamster.py +++ b/gallery_dl/extractor/xhamster.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019 Mike Fährmann +# Copyright 2019-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -49,7 +49,7 @@ class XhamsterGalleryExtractor(XhamsterExtractor): "pageURL": str, "thumbURL": str, "gallery": { - "date": "type:datetime", + "date": "dt:2019-04-16 00:07:31", "description": "", "dislikes": int, "id": 11748968, diff --git a/gallery_dl/extractor/yaplog.py b/gallery_dl/extractor/yaplog.py deleted file mode 100644 index b07ba4b..0000000 --- a/gallery_dl/extractor/yaplog.py +++ /dev/null @@ -1,128 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2019 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for https://yaplog.jp/""" - -from .common import Extractor, Message, AsynchronousMixin -from .. import text, util - - -BASE_PATTERN = r"(?:https?://)?(?:www\.)?yaplog\.jp/([\w-]+)" - - -class YaplogExtractor(AsynchronousMixin, Extractor): - """Base class for yaplog extractors""" - category = "yaplog" - root = "https://yaplog.jp" - filename_fmt = "{post[id]}_{post[title]}_{id}.{extension}" - directory_fmt = ("{category}", "{post[user]}") - archive_fmt = "{post[user]}_{id}" - - def __init__(self, match): - Extractor.__init__(self, match) - self.user = match.group(1) - - def items(self): - yield Message.Version, 1 - for post, urls in self.posts(): - yield Message.Directory, {"post": post} - for num, url in enumerate(urls, 1): - page = self.request(url).text if num > 1 else url - iurl = text.extract(page, '<img src="', '"')[0] - if iurl[0] == "/": - iurl = text.urljoin(self.root, iurl) - name, _, ext = iurl.rpartition("/")[2].rpartition(".") - iid = name.rpartition("_")[0] or name - image = { - "url" : iurl, - "num" : num, - "id" : text.parse_int(iid, iid), - "filename" : name, - "extension": ext, - "post" : post, - } - yield Message.Url, iurl, image - - def posts(self): - """Return an iterable with (data, image page URLs) tuples""" - - def _parse_post(self, url): - page = self.request(url).text - title, pos = text.extract(page, 'class="title">', '<') - date , pos = text.extract(page, 'class="date">' , '<', pos) - pid , pos = text.extract(page, '/archive/' , '"', pos) - prev , pos = text.extract(page, 'class="last"><a href="', '"', pos) - - urls = list(text.extract_iter(page, '<li><a href="', '"', pos)) - if urls: - urls[0] = page # cache HTML of first page - - if len(urls) == 24 and text.extract(page, '(1/', ')')[0] != '24': - # there are a maximum of 24 image entries in an /image/ page - # -> search /archive/ page for the rest - url = "{}/{}/archive/{}".format(self.root, self.user, pid) - page = self.request(url).text - - base = "{}/{}/image/{}/".format(self.root, self.user, pid) - for part in util.advance(text.extract_iter( - page, base, '"', pos), 24): - urls.append(base + part) - - return prev, urls, { - "id" : text.parse_int(pid), - "title": text.unescape(title[:-3]), - "user" : self.user, - "date" : text.parse_datetime(date, "%B %d [%a], %Y, %H:%M"), - } - - -class YaplogBlogExtractor(YaplogExtractor): - """Extractor for a user's blog on yaplog.jp""" - subcategory = "blog" - pattern = BASE_PATTERN + r"/?(?:$|[?&#])" - test = ("https://yaplog.jp/omitakashi3", { - "pattern": r"https://img.yaplog.jp/img/18/pc/o/m/i/omitakashi3/0/", - "count": ">= 2", - }) - - def posts(self): - url = "{}/{}/image/".format(self.root, self.user) - while url: - url, images, data = self._parse_post(url) - yield data, images - - -class YaplogPostExtractor(YaplogExtractor): - """Extractor for images from a blog post on yaplog.jp""" - subcategory = "post" - pattern = BASE_PATTERN + r"/(?:archive|image)/(\d+)" - test = ( - ("https://yaplog.jp/imamiami0726/image/1299", { - "url": "896cae20fa718735a57e723c48544e830ff31345", - "keyword": "22df8ad6cb534514c6bb2ff000381d156769a620", - }), - # complete image URLs (#443) - ("https://yaplog.jp/msjane/archive/246", { - "pattern": r"https://yaplog.jp/cv/msjane/img/246/img\d+_t.jpg" - }), - # empty post (#443) - ("https://yaplog.jp/f_l_a_s_c_o/image/872", { - "count": 0, - }), - # blog names with '-' (#443) - ("https://yaplog.jp/a-pierrot-o/image/3946/22779"), - ) - - def __init__(self, match): - YaplogExtractor.__init__(self, match) - self.post_id = match.group(2) - - def posts(self): - url = "{}/{}/image/{}".format(self.root, self.user, self.post_id) - _, images, data = self._parse_post(url) - return ((data, images),) diff --git a/gallery_dl/job.py b/gallery_dl/job.py index c717dc2..6ba2572 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -182,7 +182,14 @@ class DownloadJob(Job): self.downloaders = {} self.postprocessors = None self.out = output.select() - self.visited = parent.visited if parent else set() + + if parent: + self.visited = parent.visited + pfmt = parent.pathfmt + if pfmt and parent.extractor.config("parent-directory"): + self.extractor._parentdir = pfmt.directory + else: + self.visited = set() def handle_url(self, url, kwdict, fallback=None): """Download the resource specified in 'url'""" diff --git a/gallery_dl/output.py b/gallery_dl/output.py index 38e2f60..f084950 100644 --- a/gallery_dl/output.py +++ b/gallery_dl/output.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2019 Mike Fährmann +# Copyright 2015-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -81,6 +81,36 @@ def initialize_logging(loglevel): return logging.getLogger("gallery-dl") +def configure_logging(loglevel): + root = logging.getLogger() + minlevel = loglevel + + # stream logging handler + handler = root.handlers[0] + opts = config.interpolate(("output",), "log") + if opts: + if isinstance(opts, str): + opts = {"format": opts} + if handler.level == LOG_LEVEL and "level" in opts: + handler.setLevel(opts["level"]) + if "format" in opts or "format-date" in opts: + handler.setFormatter(Formatter( + opts.get("format", LOG_FORMAT), + opts.get("format-date", LOG_FORMAT_DATE), + )) + if minlevel > handler.level: + minlevel = handler.level + + # file logging handler + handler = setup_logging_handler("logfile", lvl=loglevel) + if handler: + root.addHandler(handler) + if minlevel > handler.level: + minlevel = handler.level + + root.setLevel(minlevel) + + def setup_logging_handler(key, fmt=LOG_FORMAT, lvl=LOG_LEVEL): """Setup a new logging handler""" opts = config.interpolate(("output",), key) @@ -112,22 +142,6 @@ def setup_logging_handler(key, fmt=LOG_FORMAT, lvl=LOG_LEVEL): return handler -def configure_logging_handler(key, handler): - """Configure a logging handler""" - opts = config.interpolate(("output",), key) - if not opts: - return - if isinstance(opts, str): - opts = {"format": opts} - if handler.level == LOG_LEVEL and "level" in opts: - handler.setLevel(opts["level"]) - if "format" in opts or "format-date" in opts: - handler.setFormatter(Formatter( - opts.get("format", LOG_FORMAT), - opts.get("format-date", LOG_FORMAT_DATE), - )) - - # -------------------------------------------------------------------- # Utility functions diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py index 162eb9e..706e706 100644 --- a/gallery_dl/postprocessor/ugoira.py +++ b/gallery_dl/postprocessor/ugoira.py @@ -42,7 +42,7 @@ class UgoiraPP(PostProcessor): if arg == "-vcodec" or arg in ("-c", "-codec") and ( not stream or stream.partition(":")[0] in ("v", "V")): vcodec = self.args[index + 1] - # use filter if libx264/5 is explicitly or implicitly used + # use filter when using libx264/5 self.prevent_odd = ( vcodec in ("libx264", "libx265") or not vcodec and self.extension.lower() in ("mp4", "mkv")) @@ -91,12 +91,12 @@ class UgoiraPP(PostProcessor): # collect command-line arguments args = [self.ffmpeg] if rate_in: - args += ["-r", str(rate_in)] - args += ["-i", ffconcat] + args += ("-r", str(rate_in)) + args += ("-i", ffconcat) if rate_out: - args += ["-r", str(rate_out)] + args += ("-r", str(rate_out)) if self.prevent_odd: - args += ["-vf", "crop=iw-mod(iw\\,2):ih-mod(ih\\,2)"] + args += ("-vf", "crop=iw-mod(iw\\,2):ih-mod(ih\\,2)") if self.args: args += self.args self.log.debug("ffmpeg args: %s", args) @@ -106,8 +106,8 @@ class UgoiraPP(PostProcessor): try: if self.twopass: if "-f" not in args: - args += ["-f", self.extension] - args += ["-passlogfile", tempdir + "/ffmpeg2pass", "-pass"] + args += ("-f", self.extension) + args += ("-passlogfile", tempdir + "/ffmpeg2pass", "-pass") self._exec(args + ["1", "-y", os.devnull]) self._exec(args + ["2", pathfmt.realpath]) else: diff --git a/gallery_dl/text.py b/gallery_dl/text.py index 72dad5b..a3f4e0a 100644 --- a/gallery_dl/text.py +++ b/gallery_dl/text.py @@ -15,6 +15,8 @@ import datetime import urllib.parse +HTML_RE = re.compile("<[^>]+>") + INVALID_XML_CHARS = ( "\x00", "\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07", "\x08", "\x0b", "\x0c", "\x0e", "\x0f", "\x10", "\x11", "\x12", @@ -39,7 +41,7 @@ def clean_xml(xmldata, repl=""): def remove_html(txt, repl=" ", sep=" "): """Remove html-tags from a string""" try: - txt = re.sub("<[^>]+>", repl, txt) + txt = HTML_RE.sub(repl, txt) except TypeError: return "" if sep: @@ -51,7 +53,7 @@ def split_html(txt, sep=None): """Split input string by html-tags""" try: return [ - x.strip() for x in re.split("<[^>]+>", txt) + x.strip() for x in HTML_RE.split(txt) if x and not x.isspace() ] except TypeError: diff --git a/gallery_dl/util.py b/gallery_dl/util.py index 13bf80e..232047c 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -21,6 +21,7 @@ import datetime import operator import itertools import urllib.parse +from http.cookiejar import Cookie from email.utils import mktime_tz, parsedate_tz from . import text, exception @@ -135,6 +136,67 @@ def remove_directory(path): pass +def load_cookiestxt(fp): + """Parse a Netscape cookies.txt file and return a list of its Cookies""" + cookies = [] + + for line in fp: + + line = line.lstrip() + # strip '#HttpOnly_' + if line.startswith("#HttpOnly_"): + line = line[10:] + # ignore empty lines and comments + if not line or line[0] in ("#", "$"): + continue + # strip trailing '\n' + if line[-1] == "\n": + line = line[:-1] + + domain, domain_specified, path, secure, expires, name, value = \ + line.split("\t") + if not name: + name = value + value = None + + cookies.append(Cookie( + 0, name, value, + None, False, + domain, + domain_specified == "TRUE", + domain.startswith("."), + path, False, + secure == "TRUE", + None if expires == "0" or not expires else expires, + False, None, None, {}, + )) + + return cookies + + +def save_cookiestxt(fp, cookies): + """Write 'cookies' in Netscape cookies.txt format to 'fp'""" + fp.write("# Netscape HTTP Cookie File\n\n") + + for cookie in cookies: + if cookie.value is None: + name = "" + value = cookie.name + else: + name = cookie.name + value = cookie.value + + fp.write("\t".join(( + cookie.domain, + "TRUE" if cookie.domain.startswith(".") else "FALSE", + cookie.path, + "TRUE" if cookie.secure else "FALSE", + "0" if cookie.expires is None else str(cookie.expires), + name, + value, + )) + "\n") + + def code_to_language(code, default=None): """Map an ISO 639-1 language code to its actual name""" return CODES.get((code or "").lower(), default) @@ -419,63 +481,85 @@ class Formatter(): self.format_map = self.fields[0][1] else: self.format_map = lambda _: format_string - del self.result - del self.fields + del self.result, self.fields - def format_map(self, kwargs): - """Apply 'kwargs' to the initial format_string and return its result""" + def format_map(self, kwdict): + """Apply 'kwdict' to the initial format_string and return its result""" + result = self.result for index, func in self.fields: - self.result[index] = func(kwargs) - return "".join(self.result) + result[index] = func(kwdict) + return "".join(result) def _field_access(self, field_name, format_spec, conversion): - first, rest = _string.formatter_field_name_split(field_name) + fmt = self._parse_format_spec(format_spec, conversion) + + if "|" in field_name: + return self._apply_list([ + self._parse_field_name(fn) + for fn in field_name.split("|") + ], fmt) + else: + key, funcs = self._parse_field_name(field_name) + if funcs: + return self._apply(key, funcs, fmt) + return self._apply_simple(key, fmt) + @staticmethod + def _parse_field_name(field_name): + first, rest = _string.formatter_field_name_split(field_name) funcs = [] + for is_attr, key in rest: if is_attr: func = operator.attrgetter - elif ":" in key: - func = self._slicegetter else: func = operator.itemgetter + try: + if ":" in key: + start, _, stop = key.partition(":") + stop, _, step = stop.partition(":") + start = int(start) if start else None + stop = int(stop) if stop else None + step = int(step) if step else None + key = slice(start, stop, step) + except TypeError: + pass # key is an integer + funcs.append(func(key)) - if conversion: - funcs.append(self.CONVERSIONS[conversion]) + return first, funcs - if format_spec: - if format_spec[0] == "?": - func = self._format_optional - elif format_spec[0] == "L": - func = self._format_maxlen - elif format_spec[0] == "J": - func = self._format_join - elif format_spec[0] == "R": - func = self._format_replace - else: - func = self._format_default - fmt = func(format_spec) - else: - fmt = str + def _parse_format_spec(self, format_spec, conversion): + fmt = self._build_format_func(format_spec) + if not conversion: + return fmt - if funcs: - return self._apply(first, funcs, fmt) - return self._apply_simple(first, fmt) + conversion = self.CONVERSIONS[conversion] + if fmt is format: + return conversion + else: + def chain(obj): + return fmt(conversion(obj)) + return chain - def _apply_simple(self, key, fmt): - def wrap(obj): - if key in obj: - obj = obj[key] - else: - obj = self.default - return fmt(obj) - return wrap + def _build_format_func(self, format_spec): + if format_spec: + fmt = format_spec[0] + if fmt == "?": + return self._parse_optional(format_spec) + if fmt == "L": + return self._parse_maxlen(format_spec) + if fmt == "J": + return self._parse_join(format_spec) + if fmt == "R": + return self._parse_replace(format_spec) + return self._default_format(format_spec) + return format def _apply(self, key, funcs, fmt): - def wrap(obj): + def wrap(kwdict): try: - obj = obj[key] + obj = kwdict[key] for func in funcs: obj = func(obj) except Exception: @@ -483,54 +567,66 @@ class Formatter(): return fmt(obj) return wrap - @staticmethod - def _slicegetter(key): - start, _, stop = key.partition(":") - stop, _, step = stop.partition(":") - start = int(start) if start else None - stop = int(stop) if stop else None - step = int(step) if step else None - return operator.itemgetter(slice(start, stop, step)) + def _apply_simple(self, key, fmt): + def wrap(kwdict): + return fmt(kwdict[key] if key in kwdict else self.default) + return wrap - @staticmethod - def _format_optional(format_spec): - def wrap(obj): - if not obj: - return "" - return before + format(obj, format_spec) + after + def _apply_list(self, lst, fmt): + def wrap(kwdict): + for key, funcs in lst: + try: + obj = kwdict[key] + for func in funcs: + obj = func(obj) + if obj is not None: + break + except Exception: + pass + else: + obj = self.default + return fmt(obj) + return wrap + + def _parse_optional(self, format_spec): before, after, format_spec = format_spec.split("/", 2) before = before[1:] - return wrap + fmt = self._build_format_func(format_spec) - @staticmethod - def _format_maxlen(format_spec): - def wrap(obj): - obj = format(obj, format_spec) - return obj if len(obj) <= maxlen else replacement + def optional(obj): + return before + fmt(obj) + after if obj else "" + return optional + + def _parse_maxlen(self, format_spec): maxlen, replacement, format_spec = format_spec.split("/", 2) maxlen = text.parse_int(maxlen[1:]) - return wrap + fmt = self._build_format_func(format_spec) - @staticmethod - def _format_join(format_spec): - def wrap(obj): - obj = separator.join(obj) - return format(obj, format_spec) + def mlen(obj): + obj = fmt(obj) + return obj if len(obj) <= maxlen else replacement + return mlen + + def _parse_join(self, format_spec): separator, _, format_spec = format_spec.partition("/") separator = separator[1:] - return wrap + fmt = self._build_format_func(format_spec) - @staticmethod - def _format_replace(format_spec): - def wrap(obj): - obj = obj.replace(old, new) - return format(obj, format_spec) + def join(obj): + return fmt(separator.join(obj)) + return join + + def _parse_replace(self, format_spec): old, new, format_spec = format_spec.split("/", 2) old = old[1:] - return wrap + fmt = self._build_format_func(format_spec) + + def replace(obj): + return fmt(obj.replace(old, new)) + return replace @staticmethod - def _format_default(format_spec): + def _default_format(format_spec): def wrap(obj): return format(obj, format_spec) return wrap @@ -565,12 +661,14 @@ class PathFormat(): self.delete = False self.path = self.realpath = self.temppath = "" - basedir = expand_path( - extractor.config("base-directory", (".", "gallery-dl"))) - if os.altsep and os.altsep in basedir: - basedir = basedir.replace(os.altsep, os.sep) - if basedir[-1] != os.sep: - basedir += os.sep + basedir = extractor._parentdir + if not basedir: + basedir = expand_path( + extractor.config("base-directory", (".", "gallery-dl"))) + if os.altsep and os.altsep in basedir: + basedir = basedir.replace(os.altsep, os.sep) + if basedir[-1] != os.sep: + basedir += os.sep self.basedirectory = basedir restrict = extractor.config("path-restrict", "auto") diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 36d729e..9171f15 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.12.3" +__version__ = "1.13.2" diff --git a/test/test_cache.py b/test/test_cache.py new file mode 100644 index 0000000..31ece7e --- /dev/null +++ b/test/test_cache.py @@ -0,0 +1,202 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright 2020 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +import unittest +import tempfile +import time + +from gallery_dl import config, util +dbpath = tempfile.mkstemp()[1] +config.set(("cache",), "file", dbpath) +from gallery_dl import cache # noqa + + +def tearDownModule(): + util.remove_file(dbpath) + + +class TestCache(unittest.TestCase): + + def test_decorator(self): + + @cache.memcache() + def mc1(): + pass + + @cache.memcache(maxage=10) + def mc2(): + pass + + @cache.cache() + def dbc(): + pass + + self.assertIsInstance(mc1, cache.CacheDecorator) + self.assertIsInstance(mc2, cache.MemoryCacheDecorator) + self.assertIsInstance(dbc, cache.DatabaseCacheDecorator) + + def test_keyarg_mem_simple(self): + @cache.memcache(keyarg=2) + def ka(a, b, c): + return a+b+c + + self.assertEqual(ka(1, 1, 1), 3) + self.assertEqual(ka(2, 2, 2), 6) + + self.assertEqual(ka(0, 0, 1), 3) + self.assertEqual(ka(9, 9, 1), 3) + self.assertEqual(ka(0, 0, 2), 6) + self.assertEqual(ka(9, 9, 2), 6) + + def test_keyarg_mem(self): + @cache.memcache(keyarg=2, maxage=10) + def ka(a, b, c): + return a+b+c + + self.assertEqual(ka(1, 1, 1), 3) + self.assertEqual(ka(2, 2, 2), 6) + + self.assertEqual(ka(0, 0, 1), 3) + self.assertEqual(ka(9, 9, 1), 3) + self.assertEqual(ka(0, 0, 2), 6) + self.assertEqual(ka(9, 9, 2), 6) + + def test_keyarg_db(self): + @cache.cache(keyarg=2, maxage=10) + def ka(a, b, c): + return a+b+c + + self.assertEqual(ka(1, 1, 1), 3) + self.assertEqual(ka(2, 2, 2), 6) + + self.assertEqual(ka(0, 0, 1), 3) + self.assertEqual(ka(9, 9, 1), 3) + self.assertEqual(ka(0, 0, 2), 6) + self.assertEqual(ka(9, 9, 2), 6) + + def test_expires_mem(self): + @cache.memcache(maxage=1) + def ex(a, b, c): + return a+b+c + + self.assertEqual(ex(1, 1, 1), 3) + self.assertEqual(ex(2, 2, 2), 3) + self.assertEqual(ex(3, 3, 3), 3) + + time.sleep(2) + self.assertEqual(ex(3, 3, 3), 9) + self.assertEqual(ex(2, 2, 2), 9) + self.assertEqual(ex(1, 1, 1), 9) + + def test_expires_db(self): + @cache.cache(maxage=1) + def ex(a, b, c): + return a+b+c + + self.assertEqual(ex(1, 1, 1), 3) + self.assertEqual(ex(2, 2, 2), 3) + self.assertEqual(ex(3, 3, 3), 3) + + time.sleep(2) + self.assertEqual(ex(3, 3, 3), 9) + self.assertEqual(ex(2, 2, 2), 9) + self.assertEqual(ex(1, 1, 1), 9) + + def test_update_mem_simple(self): + @cache.memcache(keyarg=0) + def up(a, b, c): + return a+b+c + + self.assertEqual(up(1, 1, 1), 3) + up.update(1, 0) + up.update(2, 9) + self.assertEqual(up(1, 0, 0), 0) + self.assertEqual(up(2, 0, 0), 9) + + def test_update_mem(self): + @cache.memcache(keyarg=0, maxage=10) + def up(a, b, c): + return a+b+c + + self.assertEqual(up(1, 1, 1), 3) + up.update(1, 0) + up.update(2, 9) + self.assertEqual(up(1, 0, 0), 0) + self.assertEqual(up(2, 0, 0), 9) + + def test_update_db(self): + @cache.cache(keyarg=0, maxage=10) + def up(a, b, c): + return a+b+c + + self.assertEqual(up(1, 1, 1), 3) + up.update(1, 0) + up.update(2, 9) + self.assertEqual(up(1, 0, 0), 0) + self.assertEqual(up(2, 0, 0), 9) + + def test_invalidate_mem_simple(self): + @cache.memcache(keyarg=0) + def inv(a, b, c): + return a+b+c + + self.assertEqual(inv(1, 1, 1), 3) + inv.invalidate(1) + inv.invalidate(2) + self.assertEqual(inv(1, 0, 0), 1) + self.assertEqual(inv(2, 0, 0), 2) + + def test_invalidate_mem(self): + @cache.memcache(keyarg=0, maxage=10) + def inv(a, b, c): + return a+b+c + + self.assertEqual(inv(1, 1, 1), 3) + inv.invalidate(1) + inv.invalidate(2) + self.assertEqual(inv(1, 0, 0), 1) + self.assertEqual(inv(2, 0, 0), 2) + + def test_invalidate_db(self): + @cache.cache(keyarg=0, maxage=10) + def inv(a, b, c): + return a+b+c + + self.assertEqual(inv(1, 1, 1), 3) + inv.invalidate(1) + inv.invalidate(2) + self.assertEqual(inv(1, 0, 0), 1) + self.assertEqual(inv(2, 0, 0), 2) + + def test_database_read(self): + @cache.cache(keyarg=0, maxage=10) + def db(a, b, c): + return a+b+c + + # initialize cache + self.assertEqual(db(1, 1, 1), 3) + db.update(2, 6) + + # check and clear the in-memory portion of said cache + self.assertEqual(db.cache[1][0], 3) + self.assertEqual(db.cache[2][0], 6) + db.cache.clear() + self.assertEqual(db.cache, {}) + + # fetch results from database + self.assertEqual(db(1, 0, 0), 3) + self.assertEqual(db(2, 0, 0), 6) + + # check in-memory cache updates + self.assertEqual(db.cache[1][0], 3) + self.assertEqual(db.cache[2][0], 6) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_cookies.py b/test/test_cookies.py index 4f294bf..c39a5e6 100644 --- a/test/test_cookies.py +++ b/test/test_cookies.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright 2017-2019 Mike Fährmann +# Copyright 2017-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -12,7 +12,6 @@ from unittest import mock import logging import tempfile -import http.cookiejar from os.path import join import gallery_dl.config as config @@ -34,7 +33,7 @@ class TestCookiejar(unittest.TestCase): cls.invalid_cookiefile = join(cls.path.name, "invalid.txt") with open(cls.invalid_cookiefile, "w") as file: file.write("""# asd -.example.org\tTRUE\t/\tFALSE\t253402210800\tNAME\tVALUE +.example.org\tTRUE/FALSE\t253402210800\tNAME\tVALUE """) @classmethod @@ -55,7 +54,7 @@ class TestCookiejar(unittest.TestCase): self.assertEqual(cookie.value , "VALUE") def test_invalid_cookiefile(self): - self._test_warning(self.invalid_cookiefile, http.cookiejar.LoadError) + self._test_warning(self.invalid_cookiefile, ValueError) def test_invalid_filename(self): self._test_warning(join(self.path.name, "nothing"), FileNotFoundError) diff --git a/test/test_results.py b/test/test_results.py index e87b4b8..538abfa 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -12,6 +12,7 @@ import sys import re import json import hashlib +import datetime import unittest from gallery_dl import extractor, util, job, config, exception @@ -21,14 +22,17 @@ TRAVIS_SKIP = { "exhentai", "kissmanga", "mangafox", "dynastyscans", "nijie", "bobx", "archivedmoe", "archiveofsins", "thebarchive", "fireden", "4plebs", "sankaku", "idolcomplex", "mangahere", "readcomiconline", "mangadex", - "sankakucomplex", "warosu", "fuskator", "patreon", + "sankakucomplex", "warosu", "fuskator", "patreon", "komikcast", } # temporary issues, etc. BROKEN = { - "imxto", + "35photo", "mangapark", "photobucket", + "sexcom", + "hentaicafe", + "worldthree", } @@ -154,6 +158,9 @@ class TestExtractorResults(unittest.TestCase): elif isinstance(test, str): if test.startswith("re:"): self.assertRegex(value, test[3:], msg=key) + elif test.startswith("dt:"): + self.assertIsInstance(value, datetime.datetime, msg=key) + self.assertEqual(str(value), test[3:], msg=key) elif test.startswith("type:"): self.assertEqual(type(value).__name__, test[5:], msg=key) else: @@ -267,7 +274,7 @@ class TestFormatter(util.Formatter): return "" def _apply_simple(self, key, fmt): - if key == "extension" or "._format_optional." in repr(fmt): + if key == "extension" or "._parse_optional." in repr(fmt): return self._noop def wrap(obj): @@ -275,7 +282,7 @@ class TestFormatter(util.Formatter): return wrap def _apply(self, key, funcs, fmt): - if key == "extension" or "._format_optional." in repr(fmt): + if key == "extension" or "._parse_optional." in repr(fmt): return self._noop def wrap(obj): @@ -301,6 +308,7 @@ def setup_test_config(): config.set(("extractor", "nijie") , "username", email) config.set(("extractor", "seiga") , "username", email) config.set(("extractor", "danbooru") , "username", None) + config.set(("extractor", "e621") , "username", None) config.set(("extractor", "instagram") , "username", None) config.set(("extractor", "twitter") , "username", None) diff --git a/test/test_util.py b/test/test_util.py index 5a103cf..ffabd37 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright 2015-2019 Mike Fährmann +# Copyright 2015-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -9,8 +9,10 @@ import unittest import sys +import io import random import string +import http.cookiejar from gallery_dl import util, text, exception @@ -158,11 +160,106 @@ class TestISO639_1(unittest.TestCase): self.assertEqual(func(*args), result) +class TestCookiesTxt(unittest.TestCase): + + def test_load_cookiestxt(self): + + def _assert(content, expected): + cookies = util.load_cookiestxt(io.StringIO(content, None)) + for c, e in zip(cookies, expected): + self.assertEqual(c.__dict__, e.__dict__) + + _assert("", []) + _assert("\n\n\n", []) + _assert("$ Comment", []) + _assert("# Comment", []) + _assert(" # Comment \n\n $ Comment ", []) + _assert( + ".example.org\tTRUE\t/\tTRUE\t0\tname\tvalue", + [self._cookie("name", "value", ".example.org")], + ) + _assert( + ".example.org\tTRUE\t/\tTRUE\t\tname\t", + [self._cookie("name", "", ".example.org")], + ) + _assert( + "# Netscape HTTP Cookie File\n" + "\n" + "# default\n" + ".example.org TRUE / FALSE 0 n1 v1\n" + ".example.org TRUE / TRUE 2145945600 n2 v2\n" + ".example.org TRUE /path FALSE 0 n3\n" + "\n" + " # # extra # # \n" + "www.example.org FALSE / FALSE n4 \n" + "www.example.org FALSE /path FALSE 100 n5 v5\n", + [ + self._cookie( + "n1", "v1", ".example.org", True, "/", False), + self._cookie( + "n2", "v2", ".example.org", True, "/", True, 2145945600), + self._cookie( + "n3", None, ".example.org", True, "/path", False), + self._cookie( + "n4", "" , "www.example.org", False, "/", False), + self._cookie( + "n5", "v5", "www.example.org", False, "/path", False, 100), + ], + ) + + with self.assertRaises(ValueError): + util.load_cookiestxt("example.org\tTRUE\t/\tTRUE\t0\tname") + + def test_save_cookiestxt(self): + + def _assert(cookies, expected): + fp = io.StringIO(newline=None) + util.save_cookiestxt(fp, cookies) + self.assertMultiLineEqual(fp.getvalue(), expected) + + _assert([], "# Netscape HTTP Cookie File\n\n") + _assert( + [self._cookie("name", "value", ".example.org")], + "# Netscape HTTP Cookie File\n\n" + ".example.org\tTRUE\t/\tTRUE\t0\tname\tvalue\n", + ) + _assert( + [ + self._cookie( + "n1", "v1", ".example.org", True, "/", False), + self._cookie( + "n2", "v2", ".example.org", True, "/", True, 2145945600), + self._cookie( + "n3", None, ".example.org", True, "/path", False), + self._cookie( + "n4", "" , "www.example.org", False, "/", False), + self._cookie( + "n5", "v5", "www.example.org", False, "/path", False, 100), + ], + "# Netscape HTTP Cookie File\n" + "\n" + ".example.org TRUE / FALSE 0 n1 v1\n" + ".example.org TRUE / TRUE 2145945600 n2 v2\n" + ".example.org TRUE /path FALSE 0 n3\n" + "www.example.org FALSE / FALSE 0 n4 \n" + "www.example.org FALSE /path FALSE 100 n5 v5\n", + ) + + def _cookie(self, name, value, domain, domain_specified=True, + path="/", secure=True, expires=None): + return http.cookiejar.Cookie( + 0, name, value, None, False, + domain, domain_specified, domain.startswith("."), + path, False, secure, expires, False, None, None, {}, + ) + + class TestFormatter(unittest.TestCase): kwdict = { "a": "hElLo wOrLd", "b": "äöü", + "d": {"a": "foo", "b": 0, "c": None}, "l": ["a", "b", "c"], "n": None, "u": "%27%3C%20/%20%3E%27", @@ -227,6 +324,26 @@ class TestFormatter(unittest.TestCase): self._run_test("{missing[key]}", replacement, default) self._run_test("{missing:?a//}", "a" + default, default) + def test_alternative(self): + self._run_test("{a|z}" , "hElLo wOrLd") + self._run_test("{z|a}" , "hElLo wOrLd") + self._run_test("{z|y|a}" , "hElLo wOrLd") + self._run_test("{z|y|x|a}", "hElLo wOrLd") + self._run_test("{z|n|a|y}", "hElLo wOrLd") + + self._run_test("{z|a!C}" , "Hello World") + self._run_test("{z|a:Rh/C/}" , "CElLo wOrLd") + self._run_test("{z|a!C:RH/C/}", "Cello World") + self._run_test("{z|y|x:?</>/}", "") + + self._run_test("{d[c]|d[b]|d[a]}", "0") + self._run_test("{d[a]|d[b]|d[c]}", "foo") + self._run_test("{d[z]|d[y]|d[x]}", "None") + + def test_indexing(self): + self._run_test("{l[0]}" , "a") + self._run_test("{a[6]}" , "w") + def test_slicing(self): v = self.kwdict["a"] self._run_test("{a[1:10]}" , v[1:10]) @@ -273,6 +390,18 @@ class TestFormatter(unittest.TestCase): self._run_test("{a!l:Rl//}" , "heo word") self._run_test("{name:Rame/othing/}", "Nothing") + def test_chain_special(self): + # multiple replacements + self._run_test("{a:Rh/C/RE/e/RL/l/}", "Cello wOrld") + self._run_test("{d[b]!s:R1/Q/R2/A/R0/Y/}", "Y") + + # join-and-replace + self._run_test("{l:J-/Rb/E/}", "a-E-c") + + # optional-and-maxlen + self._run_test("{d[a]:?</>/L1/too long/}", "<too long>") + self._run_test("{d[c]:?</>/L5/too long/}", "") + def _run_test(self, format_string, result, default=None): formatter = util.Formatter(format_string, default) output = formatter.format_map(self.kwdict) |