Diffstat (limited to 'gallery_dl')
40 files changed, 2479 insertions, 350 deletions
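The bulk of these insertions is two new modules, aes.py and cookies.py, both adapted from yt-dlp, which let gallery-dl read cookies directly from a browser profile. As the __init__.py and cookies.py hunks below show, a --cookies-from-browser value such as "chrome+gnomekeyring:Profile 1" is split into a (browser, profile, keyring) triple and stored as the "cookies" config value. A minimal sketch of that parsing, mirroring the partition logic from the __init__.py hunk (the standalone function name here is illustrative, not part of the commit):

    def parse_cookies_from_browser(spec):
        # "chrome+gnomekeyring:Profile 1" -> ("chrome", "Profile 1", "gnomekeyring")
        # a plain "firefox" -> ("firefox", "", "")
        browser, _, profile = spec.partition(":")
        browser, _, keyring = browser.partition("+")
        return browser, profile, keyring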
diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index 0214659..d66e285 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -12,7 +12,7 @@ import logging from . import version, config, option, output, extractor, job, util, exception __author__ = "Mike Fährmann" -__copyright__ = "Copyright 2014-2021 Mike Fährmann" +__copyright__ = "Copyright 2014-2022 Mike Fährmann" __license__ = "GPLv2" __maintainer__ = "Mike Fährmann" __email__ = "mike_faehrmann@web.de" @@ -22,10 +22,13 @@ __version__ = version.__version__ def progress(urls, pformat): """Wrapper around urls to output a simple progress indicator""" if pformat is True: - pformat = "[{current}/{total}] {url}" + pformat = "[{current}/{total}] {url}\n" + else: + pformat += "\n" + pinfo = {"total": len(urls)} for pinfo["current"], pinfo["url"] in enumerate(urls, 1): - print(pformat.format_map(pinfo), file=sys.stderr) + output.stderr_write(pformat.format_map(pinfo)) yield pinfo["url"] @@ -116,9 +119,12 @@ def main(): if args.yamlfiles: config.load(args.yamlfiles, strict=True, fmt="yaml") if args.filename: - if args.filename == "/O": - args.filename = "{filename}.{extension}" - config.set((), "filename", args.filename) + filename = args.filename + if filename == "/O": + filename = "{filename}.{extension}" + elif filename.startswith("\\f"): + filename = "\f" + filename[2:] + config.set((), "filename", filename) if args.directory: config.set((), "base-directory", args.directory) config.set((), "directory", ()) @@ -128,6 +134,10 @@ def main(): config.set((), "skip", "abort:" + str(args.abort)) if args.terminate: config.set((), "skip", "terminate:" + str(args.terminate)) + if args.cookies_from_browser: + browser, _, profile = args.cookies_from_browser.partition(":") + browser, _, keyring = browser.partition("+") + config.set((), "cookies", (browser, profile, keyring)) for opts in args.options: config.set(*opts) @@ -189,20 +199,23 @@ def main(): pass if args.list_modules: - for module_name in extractor.modules: - print(module_name) + extractor.modules.append("") + sys.stdout.write("\n".join(extractor.modules)) + elif args.list_extractors: + write = sys.stdout.write + fmt = "{}\n{}\nCategory: {} - Subcategory: {}{}\n\n".format + for extr in extractor.extractors(): if not extr.__doc__: continue - print(extr.__name__) - print(extr.__doc__) - print("Category:", extr.category, - "- Subcategory:", extr.subcategory) test = next(extr._get_tests(), None) - if test: - print("Example :", test[0]) - print() + write(fmt( + extr.__name__, extr.__doc__, + extr.category, extr.subcategory, + "\nExample : " + test[0] if test else "", + )) + elif args.clear_cache: from . import cache log = logging.getLogger("cache") diff --git a/gallery_dl/aes.py b/gallery_dl/aes.py new file mode 100644 index 0000000..22cb052 --- /dev/null +++ b/gallery_dl/aes.py @@ -0,0 +1,641 @@ +# -*- coding: utf-8 -*- + +# This is a slightly modified version of yt-dlp's aes module. 
+# https://github.com/yt-dlp/yt-dlp/blob/master/yt_dlp/aes.py + +import struct +import binascii +from math import ceil + +try: + from Cryptodome.Cipher import AES as Cryptodome_AES +except ImportError: + try: + from Crypto.Cipher import AES as Cryptodome_AES + except ImportError: + Cryptodome_AES = None + + +if Cryptodome_AES: + def aes_cbc_decrypt_bytes(data, key, iv): + """Decrypt bytes with AES-CBC using pycryptodome""" + return Cryptodome_AES.new( + key, Cryptodome_AES.MODE_CBC, iv).decrypt(data) + + def aes_gcm_decrypt_and_verify_bytes(data, key, tag, nonce): + """Decrypt bytes with AES-GCM using pycryptodome""" + return Cryptodome_AES.new( + key, Cryptodome_AES.MODE_GCM, nonce).decrypt_and_verify(data, tag) +else: + def aes_cbc_decrypt_bytes(data, key, iv): + """Decrypt bytes with AES-CBC using native implementation""" + return intlist_to_bytes(aes_cbc_decrypt( + bytes_to_intlist(data), + bytes_to_intlist(key), + bytes_to_intlist(iv), + )) + + def aes_gcm_decrypt_and_verify_bytes(data, key, tag, nonce): + """Decrypt bytes with AES-GCM using native implementation""" + return intlist_to_bytes(aes_gcm_decrypt_and_verify( + bytes_to_intlist(data), + bytes_to_intlist(key), + bytes_to_intlist(tag), + bytes_to_intlist(nonce), + )) + + +bytes_to_intlist = list + + +def intlist_to_bytes(xs): + if not xs: + return b"" + return struct.pack("%dB" % len(xs), *xs) + + +def unpad_pkcs7(data): + return data[:-data[-1]] + + +BLOCK_SIZE_BYTES = 16 + + +def aes_ecb_encrypt(data, key, iv=None): + """ + Encrypt with aes in ECB mode + + @param {int[]} data cleartext + @param {int[]} key 16/24/32-Byte cipher key + @param {int[]} iv Unused for this mode + @returns {int[]} encrypted data + """ + expanded_key = key_expansion(key) + block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + + encrypted_data = [] + for i in range(block_count): + block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] + encrypted_data += aes_encrypt(block, expanded_key) + encrypted_data = encrypted_data[:len(data)] + + return encrypted_data + + +def aes_ecb_decrypt(data, key, iv=None): + """ + Decrypt with aes in ECB mode + + @param {int[]} data cleartext + @param {int[]} key 16/24/32-Byte cipher key + @param {int[]} iv Unused for this mode + @returns {int[]} decrypted data + """ + expanded_key = key_expansion(key) + block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + + encrypted_data = [] + for i in range(block_count): + block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] + encrypted_data += aes_decrypt(block, expanded_key) + encrypted_data = encrypted_data[:len(data)] + + return encrypted_data + + +def aes_ctr_decrypt(data, key, iv): + """ + Decrypt with aes in counter mode + + @param {int[]} data cipher + @param {int[]} key 16/24/32-Byte cipher key + @param {int[]} iv 16-Byte initialization vector + @returns {int[]} decrypted data + """ + return aes_ctr_encrypt(data, key, iv) + + +def aes_ctr_encrypt(data, key, iv): + """ + Encrypt with aes in counter mode + + @param {int[]} data cleartext + @param {int[]} key 16/24/32-Byte cipher key + @param {int[]} iv 16-Byte initialization vector + @returns {int[]} encrypted data + """ + expanded_key = key_expansion(key) + block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + counter = iter_vector(iv) + + encrypted_data = [] + for i in range(block_count): + counter_block = next(counter) + block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] + block += [0] * (BLOCK_SIZE_BYTES - len(block)) + + cipher_counter_block = 
aes_encrypt(counter_block, expanded_key) + encrypted_data += xor(block, cipher_counter_block) + encrypted_data = encrypted_data[:len(data)] + + return encrypted_data + + +def aes_cbc_decrypt(data, key, iv): + """ + Decrypt with aes in CBC mode + + @param {int[]} data cipher + @param {int[]} key 16/24/32-Byte cipher key + @param {int[]} iv 16-Byte IV + @returns {int[]} decrypted data + """ + expanded_key = key_expansion(key) + block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + + decrypted_data = [] + previous_cipher_block = iv + for i in range(block_count): + block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] + block += [0] * (BLOCK_SIZE_BYTES - len(block)) + + decrypted_block = aes_decrypt(block, expanded_key) + decrypted_data += xor(decrypted_block, previous_cipher_block) + previous_cipher_block = block + decrypted_data = decrypted_data[:len(data)] + + return decrypted_data + + +def aes_cbc_encrypt(data, key, iv): + """ + Encrypt with aes in CBC mode. Using PKCS#7 padding + + @param {int[]} data cleartext + @param {int[]} key 16/24/32-Byte cipher key + @param {int[]} iv 16-Byte IV + @returns {int[]} encrypted data + """ + expanded_key = key_expansion(key) + block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + + encrypted_data = [] + previous_cipher_block = iv + for i in range(block_count): + block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] + remaining_length = BLOCK_SIZE_BYTES - len(block) + block += [remaining_length] * remaining_length + mixed_block = xor(block, previous_cipher_block) + + encrypted_block = aes_encrypt(mixed_block, expanded_key) + encrypted_data += encrypted_block + + previous_cipher_block = encrypted_block + + return encrypted_data + + +def aes_gcm_decrypt_and_verify(data, key, tag, nonce): + """ + Decrypt with aes in GCM mode and check authenticity using tag + + @param {int[]} data cipher + @param {int[]} key 16-Byte cipher key + @param {int[]} tag authentication tag + @param {int[]} nonce IV (recommended 12-Byte) + @returns {int[]} decrypted data + """ + + # XXX: check aes, gcm param + + hash_subkey = aes_encrypt([0] * BLOCK_SIZE_BYTES, key_expansion(key)) + + if len(nonce) == 12: + j0 = nonce + [0, 0, 0, 1] + else: + fill = (BLOCK_SIZE_BYTES - (len(nonce) % BLOCK_SIZE_BYTES)) % \ BLOCK_SIZE_BYTES + 8 + ghash_in = nonce + [0] * fill + bytes_to_intlist( + (8 * len(nonce)).to_bytes(8, "big")) + j0 = ghash(hash_subkey, ghash_in) + + # TODO: add nonce support to aes_ctr_decrypt + + # nonce_ctr = j0[:12] + iv_ctr = inc(j0) + + decrypted_data = aes_ctr_decrypt( + data, key, iv_ctr + [0] * (BLOCK_SIZE_BYTES - len(iv_ctr))) + + pad_len = len(data) // 16 * 16 + s_tag = ghash( + hash_subkey, + data + + [0] * (BLOCK_SIZE_BYTES - len(data) + pad_len) + # pad + bytes_to_intlist( + (0 * 8).to_bytes(8, "big") + # length of associated data + ((len(data) * 8).to_bytes(8, "big")) # length of data + ) + ) + + if tag != aes_ctr_encrypt(s_tag, key, j0): + raise ValueError("Mismatching authentication tag") + + return decrypted_data + + +def aes_encrypt(data, expanded_key): + """ + Encrypt one block with aes + + @param {int[]} data 16-Byte state + @param {int[]} expanded_key 176/208/240-Byte expanded key + @returns {int[]} 16-Byte cipher + """ + rounds = len(expanded_key) // BLOCK_SIZE_BYTES - 1 + + data = xor(data, expanded_key[:BLOCK_SIZE_BYTES]) + for i in range(1, rounds + 1): + data = sub_bytes(data) + data = shift_rows(data) + if i != rounds: + data = list(iter_mix_columns(data, MIX_COLUMN_MATRIX)) + data = xor(data, expanded_key[ + i 
* BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES]) + + return data + + +def aes_decrypt(data, expanded_key): + """ + Decrypt one block with aes + + @param {int[]} data 16-Byte cipher + @param {int[]} expanded_key 176/208/240-Byte expanded key + @returns {int[]} 16-Byte state + """ + rounds = len(expanded_key) // BLOCK_SIZE_BYTES - 1 + + for i in range(rounds, 0, -1): + data = xor(data, expanded_key[ + i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES]) + if i != rounds: + data = list(iter_mix_columns(data, MIX_COLUMN_MATRIX_INV)) + data = shift_rows_inv(data) + data = sub_bytes_inv(data) + data = xor(data, expanded_key[:BLOCK_SIZE_BYTES]) + + return data + + +def aes_decrypt_text(data, password, key_size_bytes): + """ + Decrypt text + - The first 8 Bytes of decoded 'data' are the 8 high Bytes of the counter + - The cipher key is retrieved by encrypting the first 16 Byte of 'password' + with the first 'key_size_bytes' Bytes from 'password' + (if necessary filled with 0's) + - Mode of operation is 'counter' + + @param {str} data Base64 encoded string + @param {str,unicode} password Password (will be encoded with utf-8) + @param {int} key_size_bytes Possible values: 16 for 128-Bit, + 24 for 192-Bit, or + 32 for 256-Bit + @returns {str} Decrypted data + """ + NONCE_LENGTH_BYTES = 8 + + data = bytes_to_intlist(binascii.a2b_base64(data)) + password = bytes_to_intlist(password.encode("utf-8")) + + key = password[:key_size_bytes] + [0] * (key_size_bytes - len(password)) + key = aes_encrypt(key[:BLOCK_SIZE_BYTES], key_expansion(key)) * \ + (key_size_bytes // BLOCK_SIZE_BYTES) + + nonce = data[:NONCE_LENGTH_BYTES] + cipher = data[NONCE_LENGTH_BYTES:] + + return intlist_to_bytes(aes_ctr_decrypt( + cipher, key, nonce + [0] * (BLOCK_SIZE_BYTES - NONCE_LENGTH_BYTES) + )) + + +RCON = ( + 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, +) + +SBOX = ( + 0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, + 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76, + 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, + 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, + 0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, + 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15, + 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, + 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75, + 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, + 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, + 0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, + 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF, + 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, + 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8, + 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, + 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, + 0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, + 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73, + 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, + 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB, + 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, + 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, + 0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, + 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08, + 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, + 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A, + 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, + 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, + 0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, + 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF, + 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, + 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16, +) + +SBOX_INV = ( + 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, + 
0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, + 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, + 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, + 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, + 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, + 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, + 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, + 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, + 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, + 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, + 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, + 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, + 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, + 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, + 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, + 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, + 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, + 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, + 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, + 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, + 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, + 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, + 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, + 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, + 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, + 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, + 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, + 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, + 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, + 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, + 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d +) + +MIX_COLUMN_MATRIX = ( + (0x2, 0x3, 0x1, 0x1), + (0x1, 0x2, 0x3, 0x1), + (0x1, 0x1, 0x2, 0x3), + (0x3, 0x1, 0x1, 0x2), +) + +MIX_COLUMN_MATRIX_INV = ( + (0xE, 0xB, 0xD, 0x9), + (0x9, 0xE, 0xB, 0xD), + (0xD, 0x9, 0xE, 0xB), + (0xB, 0xD, 0x9, 0xE), +) + +RIJNDAEL_EXP_TABLE = ( + 0x01, 0x03, 0x05, 0x0F, 0x11, 0x33, 0x55, 0xFF, + 0x1A, 0x2E, 0x72, 0x96, 0xA1, 0xF8, 0x13, 0x35, + 0x5F, 0xE1, 0x38, 0x48, 0xD8, 0x73, 0x95, 0xA4, + 0xF7, 0x02, 0x06, 0x0A, 0x1E, 0x22, 0x66, 0xAA, + 0xE5, 0x34, 0x5C, 0xE4, 0x37, 0x59, 0xEB, 0x26, + 0x6A, 0xBE, 0xD9, 0x70, 0x90, 0xAB, 0xE6, 0x31, + 0x53, 0xF5, 0x04, 0x0C, 0x14, 0x3C, 0x44, 0xCC, + 0x4F, 0xD1, 0x68, 0xB8, 0xD3, 0x6E, 0xB2, 0xCD, + 0x4C, 0xD4, 0x67, 0xA9, 0xE0, 0x3B, 0x4D, 0xD7, + 0x62, 0xA6, 0xF1, 0x08, 0x18, 0x28, 0x78, 0x88, + 0x83, 0x9E, 0xB9, 0xD0, 0x6B, 0xBD, 0xDC, 0x7F, + 0x81, 0x98, 0xB3, 0xCE, 0x49, 0xDB, 0x76, 0x9A, + 0xB5, 0xC4, 0x57, 0xF9, 0x10, 0x30, 0x50, 0xF0, + 0x0B, 0x1D, 0x27, 0x69, 0xBB, 0xD6, 0x61, 0xA3, + 0xFE, 0x19, 0x2B, 0x7D, 0x87, 0x92, 0xAD, 0xEC, + 0x2F, 0x71, 0x93, 0xAE, 0xE9, 0x20, 0x60, 0xA0, + 0xFB, 0x16, 0x3A, 0x4E, 0xD2, 0x6D, 0xB7, 0xC2, + 0x5D, 0xE7, 0x32, 0x56, 0xFA, 0x15, 0x3F, 0x41, + 0xC3, 0x5E, 0xE2, 0x3D, 0x47, 0xC9, 0x40, 0xC0, + 0x5B, 0xED, 0x2C, 0x74, 0x9C, 0xBF, 0xDA, 0x75, + 0x9F, 0xBA, 0xD5, 0x64, 0xAC, 0xEF, 0x2A, 0x7E, + 0x82, 0x9D, 0xBC, 0xDF, 0x7A, 0x8E, 0x89, 0x80, + 0x9B, 0xB6, 0xC1, 0x58, 0xE8, 0x23, 0x65, 0xAF, + 0xEA, 0x25, 0x6F, 0xB1, 0xC8, 0x43, 0xC5, 0x54, + 0xFC, 0x1F, 0x21, 0x63, 0xA5, 0xF4, 0x07, 0x09, + 0x1B, 0x2D, 0x77, 0x99, 0xB0, 0xCB, 0x46, 0xCA, + 0x45, 0xCF, 0x4A, 0xDE, 0x79, 0x8B, 0x86, 0x91, + 0xA8, 0xE3, 0x3E, 0x42, 0xC6, 0x51, 0xF3, 0x0E, + 0x12, 0x36, 0x5A, 0xEE, 0x29, 0x7B, 0x8D, 0x8C, + 0x8F, 0x8A, 0x85, 0x94, 0xA7, 0xF2, 0x0D, 0x17, + 0x39, 0x4B, 0xDD, 0x7C, 0x84, 0x97, 0xA2, 0xFD, + 0x1C, 0x24, 0x6C, 0xB4, 0xC7, 0x52, 0xF6, 0x01, +) + +RIJNDAEL_LOG_TABLE = ( + 0x00, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, + 0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 
0x03, + 0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, + 0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1, + 0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, + 0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78, + 0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, + 0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e, + 0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, + 0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38, + 0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, + 0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10, + 0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, + 0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba, + 0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, + 0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57, + 0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, + 0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8, + 0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, + 0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0, + 0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, + 0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7, + 0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, + 0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d, + 0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, + 0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1, + 0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, + 0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab, + 0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, + 0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5, + 0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, + 0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07, +) + + +def key_expansion(data): + """ + Generate key schedule + + @param {int[]} data 16/24/32-Byte cipher key + @returns {int[]} 176/208/240-Byte expanded key + """ + data = data[:] # copy + rcon_iteration = 1 + key_size_bytes = len(data) + expanded_key_size_bytes = (key_size_bytes // 4 + 7) * BLOCK_SIZE_BYTES + + while len(data) < expanded_key_size_bytes: + temp = data[-4:] + temp = key_schedule_core(temp, rcon_iteration) + rcon_iteration += 1 + data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) + + for _ in range(3): + temp = data[-4:] + data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) + + if key_size_bytes == 32: + temp = data[-4:] + temp = sub_bytes(temp) + data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) + + for _ in range(3 if key_size_bytes == 32 else + 2 if key_size_bytes == 24 else 0): + temp = data[-4:] + data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) + data = data[:expanded_key_size_bytes] + + return data + + +def iter_vector(iv): + while True: + yield iv + iv = inc(iv) + + +def sub_bytes(data): + return [SBOX[x] for x in data] + + +def sub_bytes_inv(data): + return [SBOX_INV[x] for x in data] + + +def rotate(data): + return data[1:] + [data[0]] + + +def key_schedule_core(data, rcon_iteration): + data = rotate(data) + data = sub_bytes(data) + data[0] = data[0] ^ RCON[rcon_iteration] + + return data + + +def xor(data1, data2): + return [x ^ y for x, y in zip(data1, data2)] + + +def iter_mix_columns(data, matrix): + for i in (0, 4, 8, 12): + for row in matrix: + mixed = 0 + for j in range(4): + if data[i:i + 4][j] == 0 or row[j] == 0: + mixed ^= 0 + else: + mixed ^= RIJNDAEL_EXP_TABLE[ + (RIJNDAEL_LOG_TABLE[data[i + j]] + + RIJNDAEL_LOG_TABLE[row[j]]) % 0xFF + ] + yield mixed + + +def shift_rows(data): + return [ + data[((column + row) & 0b11) * 4 + row] + for column in range(4) + for row in range(4) + ] + + +def shift_rows_inv(data): + return [ + data[((column - row) & 0b11) * 4 + row] + for column in range(4) + for row in range(4) + ] + + +def 
shift_block(data): + data_shifted = [] + + bit = 0 + for n in data: + if bit: + n |= 0x100 + bit = n & 1 + n >>= 1 + data_shifted.append(n) + + return data_shifted + + +def inc(data): + data = data[:] # copy + for i in range(len(data) - 1, -1, -1): + if data[i] == 255: + data[i] = 0 + else: + data[i] = data[i] + 1 + break + return data + + +def block_product(block_x, block_y): + # NIST SP 800-38D, Algorithm 1 + + if len(block_x) != BLOCK_SIZE_BYTES or len(block_y) != BLOCK_SIZE_BYTES: + raise ValueError( + "Length of blocks needs to be %d bytes" % BLOCK_SIZE_BYTES) + + block_r = [0xE1] + [0] * (BLOCK_SIZE_BYTES - 1) + block_v = block_y[:] + block_z = [0] * BLOCK_SIZE_BYTES + + for i in block_x: + for bit in range(7, -1, -1): + if i & (1 << bit): + block_z = xor(block_z, block_v) + + do_xor = block_v[-1] & 1 + block_v = shift_block(block_v) + if do_xor: + block_v = xor(block_v, block_r) + + return block_z + + +def ghash(subkey, data): + # NIST SP 800-38D, Algorithm 2 + + if len(data) % BLOCK_SIZE_BYTES: + raise ValueError( + "Length of data should be a multiple of %d bytes" % BLOCK_SIZE_BYTES) + + last_y = [0] * BLOCK_SIZE_BYTES + for i in range(0, len(data), BLOCK_SIZE_BYTES): + block = data[i: i + BLOCK_SIZE_BYTES] + last_y = block_product(xor(last_y, block), subkey) + + return last_y diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py new file mode 100644 index 0000000..b173a30 --- /dev/null +++ b/gallery_dl/cookies.py @@ -0,0 +1,956 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +# Adapted from yt-dlp's cookies module. +# https://github.com/yt-dlp/yt-dlp/blob/master/yt_dlp/cookies.py + +import binascii +import contextlib +import ctypes +import json +import logging +import os +import shutil +import sqlite3 +import struct +import subprocess +import sys +import tempfile +from datetime import datetime, timedelta, timezone +from hashlib import pbkdf2_hmac +from http.cookiejar import Cookie +from . 
import aes + + +SUPPORTED_BROWSERS_CHROMIUM = { + "brave", "chrome", "chromium", "edge", "opera", "vivaldi"} +SUPPORTED_BROWSERS = SUPPORTED_BROWSERS_CHROMIUM | {"firefox", "safari"} + +logger = logging.getLogger("cookies") + + +def load_cookies(cookiejar, browser_specification): + browser_name, profile, keyring = \ + _parse_browser_specification(*browser_specification) + + if browser_name == "firefox": + load_cookies_firefox(cookiejar, profile) + elif browser_name == "safari": + load_cookies_safari(cookiejar, profile) + elif browser_name in SUPPORTED_BROWSERS_CHROMIUM: + load_cookies_chrome(cookiejar, browser_name, profile, keyring) + else: + raise ValueError("unknown browser '{}'".format(browser_name)) + + +def load_cookies_firefox(cookiejar, profile=None): + set_cookie = cookiejar.set_cookie + with _firefox_cookies_database(profile) as db: + for name, value, domain, path, secure, expires in db.execute( + "SELECT name, value, host, path, isSecure, expiry " + "FROM moz_cookies"): + set_cookie(Cookie( + 0, name, value, None, False, + domain, bool(domain), domain.startswith("."), + path, bool(path), secure, expires, False, None, None, {}, + )) + + +def load_cookies_safari(cookiejar, profile=None): + """Ref.: https://github.com/libyal/dtformats/blob + /main/documentation/Safari%20Cookies.asciidoc + - This data appears to be out of date + but the important parts of the database structure is the same + - There are a few bytes here and there + which are skipped during parsing + """ + with _safari_cookies_database() as fp: + data = fp.read() + page_sizes, body_start = _safari_parse_cookies_header(data) + p = DataParser(data[body_start:]) + for page_size in page_sizes: + _safari_parse_cookies_page(p.read_bytes(page_size), cookiejar) + + +def load_cookies_chrome(cookiejar, browser_name, profile, keyring): + config = _get_chromium_based_browser_settings(browser_name) + + with _chrome_cookies_database(profile, config) as db: + + db.text_factory = bytes + decryptor = get_cookie_decryptor( + config["directory"], config["keyring"], keyring=keyring) + + try: + rows = db.execute( + "SELECT host_key, name, value, encrypted_value, path, " + "expires_utc, is_secure FROM cookies") + except sqlite3.OperationalError: + rows = db.execute( + "SELECT host_key, name, value, encrypted_value, path, " + "expires_utc, secure FROM cookies") + + set_cookie = cookiejar.set_cookie + failed_cookies = unencrypted_cookies = 0 + + for domain, name, value, enc_value, path, expires, secure in rows: + + if not value and enc_value: # encrypted + value = decryptor.decrypt(enc_value) + if value is None: + failed_cookies += 1 + continue + else: + value = value.decode() + unencrypted_cookies += 1 + + domain = domain.decode() + path = path.decode() + name = name.decode() + + set_cookie(Cookie( + 0, name, value, None, False, + domain, bool(domain), domain.startswith("."), + path, bool(path), secure, expires, False, None, None, {}, + )) + + if failed_cookies > 0: + failed_message = " ({} could not be decrypted)".format(failed_cookies) + else: + failed_message = "" + + logger.info("Extracted %s cookies from %s%s", + len(cookiejar), browser_name, failed_message) + counts = decryptor.cookie_counts.copy() + counts["unencrypted"] = unencrypted_cookies + logger.debug("cookie version breakdown: %s", counts) + + +# -------------------------------------------------------------------- +# firefox + +def _firefox_cookies_database(profile=None): + if profile is None: + search_root = _firefox_browser_directory() + elif _is_path(profile): + 
search_root = profile + else: + search_root = os.path.join(_firefox_browser_directory(), profile) + + path = _find_most_recently_used_file(search_root, "cookies.sqlite") + if path is None: + raise FileNotFoundError("Unable to find Firefox cookies database in " + "{}".format(search_root)) + + logger.debug("Extracting cookies from %s", path) + return DatabaseCopy(path) + + +def _firefox_browser_directory(): + if sys.platform in ("linux", "linux2"): + return os.path.expanduser("~/.mozilla/firefox") + if sys.platform == "win32": + return os.path.expandvars(R"%APPDATA%\Mozilla\Firefox\Profiles") + if sys.platform == "darwin": + return os.path.expanduser("~/Library/Application Support/Firefox") + raise ValueError("unsupported platform '{}'".format(sys.platform)) + + +# -------------------------------------------------------------------- +# safari + +def _safari_cookies_database(): + try: + path = os.path.expanduser("~/Library/Cookies/Cookies.binarycookies") + return open(path, "rb") + except FileNotFoundError: + logger.debug("Trying secondary cookie location") + path = os.path.expanduser("~/Library/Containers/com.apple.Safari/Data" + "/Library/Cookies/Cookies.binarycookies") + return open(path, "rb") + + +def _safari_parse_cookies_header(data): + p = DataParser(data) + p.expect_bytes(b"cook", "database signature") + number_of_pages = p.read_uint(big_endian=True) + page_sizes = [p.read_uint(big_endian=True) + for _ in range(number_of_pages)] + return page_sizes, p.cursor + + +def _safari_parse_cookies_page(data, jar): + p = DataParser(data) + p.expect_bytes(b"\x00\x00\x01\x00", "page signature") + number_of_cookies = p.read_uint() + record_offsets = [p.read_uint() for _ in range(number_of_cookies)] + if number_of_cookies == 0: + logger.debug("a cookies page of size %s has no cookies", len(data)) + return + + p.skip_to(record_offsets[0], "unknown page header field") + + for i, record_offset in enumerate(record_offsets): + p.skip_to(record_offset, "space between records") + record_length = _safari_parse_cookies_record( + data[record_offset:], jar) + p.read_bytes(record_length) + p.skip_to_end("space in between pages") + + +def _safari_parse_cookies_record(data, cookiejar): + p = DataParser(data) + record_size = p.read_uint() + p.skip(4, "unknown record field 1") + flags = p.read_uint() + is_secure = bool(flags & 0x0001) + p.skip(4, "unknown record field 2") + domain_offset = p.read_uint() + name_offset = p.read_uint() + path_offset = p.read_uint() + value_offset = p.read_uint() + p.skip(8, "unknown record field 3") + expiration_date = _mac_absolute_time_to_posix(p.read_double()) + _creation_date = _mac_absolute_time_to_posix(p.read_double()) # noqa: F841 + + try: + p.skip_to(domain_offset) + domain = p.read_cstring() + + p.skip_to(name_offset) + name = p.read_cstring() + + p.skip_to(path_offset) + path = p.read_cstring() + + p.skip_to(value_offset) + value = p.read_cstring() + except UnicodeDecodeError: + logger.warning("failed to parse Safari cookie " + "because UTF-8 decoding failed") + return record_size + + p.skip_to(record_size, "space at the end of the record") + + cookiejar.set_cookie(Cookie( + 0, name, value, None, False, + domain, bool(domain), domain.startswith('.'), + path, bool(path), is_secure, expiration_date, False, + None, None, {}, + )) + + return record_size + + +# -------------------------------------------------------------------- +# chrome + +def _chrome_cookies_database(profile, config): + if profile is None: + search_root = config["directory"] + elif _is_path(profile): + 
search_root = profile + config["directory"] = (os.path.dirname(profile) + if config["profiles"] else profile) + elif config["profiles"]: + search_root = os.path.join(config["directory"], profile) + else: + logger.warning("%s does not support profiles", config["browser"]) + search_root = config["directory"] + + path = _find_most_recently_used_file(search_root, "Cookies") + if path is None: + raise FileNotFoundError("Unable to find {} cookies database in " + "'{}'".format(config["browser"], search_root)) + + logger.debug("Extracting cookies from %s", path) + return DatabaseCopy(path) + + +def _get_chromium_based_browser_settings(browser_name): + # https://chromium.googlesource.com/chromium + # /src/+/HEAD/docs/user_data_dir.md + join = os.path.join + + if sys.platform in ("linux", "linux2"): + config = (os.environ.get("XDG_CONFIG_HOME") or + os.path.expanduser("~/.config")) + + browser_dir = { + "brave" : join(config, "BraveSoftware/Brave-Browser"), + "chrome" : join(config, "google-chrome"), + "chromium": join(config, "chromium"), + "edge" : join(config, "microsoft-edge"), + "opera" : join(config, "opera"), + "vivaldi" : join(config, "vivaldi"), + }[browser_name] + + elif sys.platform == "win32": + appdata_local = os.path.expandvars("%LOCALAPPDATA%") + appdata_roaming = os.path.expandvars("%APPDATA%") + browser_dir = { + "brave" : join(appdata_local, + R"BraveSoftware\Brave-Browser\User Data"), + "chrome" : join(appdata_local, R"Google\Chrome\User Data"), + "chromium": join(appdata_local, R"Chromium\User Data"), + "edge" : join(appdata_local, R"Microsoft\Edge\User Data"), + "opera" : join(appdata_roaming, R"Opera Software\Opera Stable"), + "vivaldi" : join(appdata_local, R"Vivaldi\User Data"), + }[browser_name] + + elif sys.platform == "darwin": + appdata = os.path.expanduser("~/Library/Application Support") + browser_dir = { + "brave" : join(appdata, "BraveSoftware/Brave-Browser"), + "chrome" : join(appdata, "Google/Chrome"), + "chromium": join(appdata, "Chromium"), + "edge" : join(appdata, "Microsoft Edge"), + "opera" : join(appdata, "com.operasoftware.Opera"), + "vivaldi" : join(appdata, "Vivaldi"), + }[browser_name] + + else: + raise ValueError("unsupported platform '{}'".format(sys.platform)) + + # Linux keyring names can be determined by snooping on dbus + # while opening the browser in KDE: + # dbus-monitor "interface="org.kde.KWallet"" "type=method_return" + keyring_name = { + "brave" : "Brave", + "chrome" : "Chrome", + "chromium": "Chromium", + "edge" : "Microsoft Edge" if sys.platform == "darwin" else + "Chromium", + "opera" : "Opera" if sys.platform == "darwin" else "Chromium", + "vivaldi" : "Vivaldi" if sys.platform == "darwin" else "Chrome", + }[browser_name] + + browsers_without_profiles = {"opera"} + + return { + "browser" : browser_name, + "directory": browser_dir, + "keyring" : keyring_name, + "profiles" : browser_name not in browsers_without_profiles + } + + +class ChromeCookieDecryptor: + """ + Overview: + + Linux: + - cookies are either v10 or v11 + - v10: AES-CBC encrypted with a fixed key + - v11: AES-CBC encrypted with an OS protected key (keyring) + - v11 keys can be stored in various places depending on the + active desktop environment [2] + + Mac: + - cookies are either v10 or not v10 + - v10: AES-CBC encrypted with an OS protected key (keyring) + and more key derivation iterations than linux + - not v10: "old data" stored as plaintext + + Windows: + - cookies are either v10 or not v10 + - v10: AES-GCM encrypted with a key which is encrypted with DPAPI + - not 
v10: encrypted with DPAPI + + Sources: + - [1] https://chromium.googlesource.com/chromium/src/+/refs/heads + /main/components/os_crypt/ + - [2] https://chromium.googlesource.com/chromium/src/+/refs/heads + /main/components/os_crypt/key_storage_linux.cc + - KeyStorageLinux::CreateService + """ + + def decrypt(self, encrypted_value): + raise NotImplementedError("Must be implemented by sub classes") + + @property + def cookie_counts(self): + raise NotImplementedError("Must be implemented by sub classes") + + +def get_cookie_decryptor(browser_root, browser_keyring_name, *, keyring=None): + if sys.platform in ("linux", "linux2"): + return LinuxChromeCookieDecryptor( + browser_keyring_name, keyring=keyring) + elif sys.platform == "darwin": + return MacChromeCookieDecryptor(browser_keyring_name) + elif sys.platform == "win32": + return WindowsChromeCookieDecryptor(browser_root) + else: + raise NotImplementedError("Chrome cookie decryption is not supported " + "on {}".format(sys.platform)) + + +class LinuxChromeCookieDecryptor(ChromeCookieDecryptor): + def __init__(self, browser_keyring_name, *, keyring=None): + self._v10_key = self.derive_key(b"peanuts") + password = _get_linux_keyring_password(browser_keyring_name, keyring) + self._v11_key = None if password is None else self.derive_key(password) + self._cookie_counts = {"v10": 0, "v11": 0, "other": 0} + + @staticmethod + def derive_key(password): + # values from + # https://chromium.googlesource.com/chromium/src/+/refs/heads + # /main/components/os_crypt/os_crypt_linux.cc + return pbkdf2_sha1(password, salt=b"saltysalt", + iterations=1, key_length=16) + + @property + def cookie_counts(self): + return self._cookie_counts + + def decrypt(self, encrypted_value): + version = encrypted_value[:3] + ciphertext = encrypted_value[3:] + + if version == b"v10": + self._cookie_counts["v10"] += 1 + return _decrypt_aes_cbc(ciphertext, self._v10_key) + + elif version == b"v11": + self._cookie_counts["v11"] += 1 + if self._v11_key is None: + logger.warning("cannot decrypt v11 cookies: no key found") + return None + return _decrypt_aes_cbc(ciphertext, self._v11_key) + + else: + self._cookie_counts["other"] += 1 + return None + + +class MacChromeCookieDecryptor(ChromeCookieDecryptor): + def __init__(self, browser_keyring_name): + password = _get_mac_keyring_password(browser_keyring_name) + self._v10_key = None if password is None else self.derive_key(password) + self._cookie_counts = {"v10": 0, "other": 0} + + @staticmethod + def derive_key(password): + # values from + # https://chromium.googlesource.com/chromium/src/+/refs/heads + # /main/components/os_crypt/os_crypt_mac.mm + return pbkdf2_sha1(password, salt=b"saltysalt", + iterations=1003, key_length=16) + + @property + def cookie_counts(self): + return self._cookie_counts + + def decrypt(self, encrypted_value): + version = encrypted_value[:3] + ciphertext = encrypted_value[3:] + + if version == b"v10": + self._cookie_counts["v10"] += 1 + if self._v10_key is None: + logger.warning("cannot decrypt v10 cookies: no key found") + return None + + return _decrypt_aes_cbc(ciphertext, self._v10_key) + + else: + self._cookie_counts["other"] += 1 + # other prefixes are considered "old data", + # which were stored as plaintext + # https://chromium.googlesource.com/chromium/src/+/refs/heads + # /main/components/os_crypt/os_crypt_mac.mm + return encrypted_value + + +class WindowsChromeCookieDecryptor(ChromeCookieDecryptor): + def __init__(self, browser_root): + self._v10_key = _get_windows_v10_key(browser_root) + 
self._cookie_counts = {"v10": 0, "other": 0} + + @property + def cookie_counts(self): + return self._cookie_counts + + def decrypt(self, encrypted_value): + version = encrypted_value[:3] + ciphertext = encrypted_value[3:] + + if version == b"v10": + self._cookie_counts["v10"] += 1 + if self._v10_key is None: + logger.warning("cannot decrypt v10 cookies: no key found") + return None + + # https://chromium.googlesource.com/chromium/src/+/refs/heads + # /main/components/os_crypt/os_crypt_win.cc + # kNonceLength + nonce_length = 96 // 8 + # boringssl + # EVP_AEAD_AES_GCM_TAG_LEN + authentication_tag_length = 16 + + raw_ciphertext = ciphertext + nonce = raw_ciphertext[:nonce_length] + ciphertext = raw_ciphertext[ + nonce_length:-authentication_tag_length] + authentication_tag = raw_ciphertext[-authentication_tag_length:] + + return _decrypt_aes_gcm( + ciphertext, self._v10_key, nonce, authentication_tag) + + else: + self._cookie_counts["other"] += 1 + # any other prefix means the data is DPAPI encrypted + # https://chromium.googlesource.com/chromium/src/+/refs/heads + # /main/components/os_crypt/os_crypt_win.cc + return _decrypt_windows_dpapi(encrypted_value).decode() + + +# -------------------------------------------------------------------- +# keyring + +def _choose_linux_keyring(): + """ + https://chromium.googlesource.com/chromium/src/+/refs/heads + /main/components/os_crypt/key_storage_util_linux.cc + SelectBackend + """ + desktop_environment = _get_linux_desktop_environment(os.environ) + logger.debug("Detected desktop environment: %s", desktop_environment) + if desktop_environment == DE_KDE: + return KEYRING_KWALLET + if desktop_environment == DE_OTHER: + return KEYRING_BASICTEXT + return KEYRING_GNOMEKEYRING + + +def _get_kwallet_network_wallet(): + """ The name of the wallet used to store network passwords. + + https://chromium.googlesource.com/chromium/src/+/refs/heads + /main/components/os_crypt/kwallet_dbus.cc + KWalletDBus::NetworkWallet + which does a dbus call to the following function: + https://api.kde.org/frameworks/kwallet/html/classKWallet_1_1Wallet.html + Wallet::NetworkWallet + """ + default_wallet = "kdewallet" + try: + proc, stdout = Popen_communicate( + "dbus-send", "--session", "--print-reply=literal", + "--dest=org.kde.kwalletd5", + "/modules/kwalletd5", + "org.kde.KWallet.networkWallet" + ) + + if proc.returncode != 0: + logger.warning("failed to read NetworkWallet") + return default_wallet + else: + network_wallet = stdout.decode().strip() + logger.debug("NetworkWallet = '%s'", network_wallet) + return network_wallet + except Exception as exc: + logger.warning("exception while obtaining NetworkWallet (%s: %s)", + exc.__class__.__name__, exc) + return default_wallet + + +def _get_kwallet_password(browser_keyring_name): + logger.debug("using kwallet-query to obtain password from kwallet") + + if shutil.which("kwallet-query") is None: + logger.error( + "kwallet-query command not found. KWallet and kwallet-query " + "must be installed to read from KWallet. kwallet-query should be " + "included in the kwallet package for your distribution") + return b"" + + network_wallet = _get_kwallet_network_wallet() + + try: + proc, stdout = Popen_communicate( + "kwallet-query", + "--read-password", browser_keyring_name + " Safe Storage", + "--folder", browser_keyring_name + " Keys", + network_wallet, + ) + + if proc.returncode != 0: + logger.error("kwallet-query failed with return code {}. 
" + "Please consult the kwallet-query man page " + "for details".format(proc.returncode)) + return b"" + + if stdout.lower().startswith(b"failed to read"): + logger.debug("Failed to read password from kwallet. " + "Using empty string instead") + # This sometimes occurs in KDE because chrome does not check + # hasEntry and instead just tries to read the value (which + # kwallet returns "") whereas kwallet-query checks hasEntry. + # To verify this: + # dbus-monitor "interface="org.kde.KWallet"" "type=method_return" + # while starting chrome. + # This may be a bug, as the intended behaviour is to generate a + # random password and store it, but that doesn't matter here. + return b"" + else: + logger.debug("password found") + if stdout[-1:] == b"\n": + stdout = stdout[:-1] + return stdout + except Exception as exc: + logger.warning("exception running kwallet-query (%s: %s)", + exc.__class__.__name__, exc) + return b"" + + +def _get_gnome_keyring_password(browser_keyring_name): + try: + import secretstorage + except ImportError: + logger.error("secretstorage not available") + return b"" + + # Gnome keyring does not seem to organise keys in the same way as KWallet, + # using `dbus-monitor` during startup, it can be observed that chromium + # lists all keys and presumably searches for its key in the list. + # It appears that we must do the same. + # https://github.com/jaraco/keyring/issues/556 + with contextlib.closing(secretstorage.dbus_init()) as con: + col = secretstorage.get_default_collection(con) + label = browser_keyring_name + " Safe Storage" + for item in col.get_all_items(): + if item.get_label() == label: + return item.get_secret() + else: + logger.error("failed to read from keyring") + return b"" + + +def _get_linux_keyring_password(browser_keyring_name, keyring): + # Note: chrome/chromium can be run with the following flags + # to determine which keyring backend it has chosen to use + # - chromium --enable-logging=stderr --v=1 2>&1 | grep key_storage_ + # + # Chromium supports --password-store=<basic|gnome|kwallet> + # so the automatic detection will not be sufficient in all cases. 
+ + if not keyring: + keyring = _choose_linux_keyring() + logger.debug("Chosen keyring: %s", keyring) + + if keyring == KEYRING_KWALLET: + return _get_kwallet_password(browser_keyring_name) + elif keyring == KEYRING_GNOMEKEYRING: + return _get_gnome_keyring_password(browser_keyring_name) + elif keyring == KEYRING_BASICTEXT: + # when basic text is chosen, all cookies are stored as v10 + # so no keyring password is required + return None + assert False, "Unknown keyring " + keyring + + +def _get_mac_keyring_password(browser_keyring_name): + logger.debug("using find-generic-password to obtain " + "password from OSX keychain") + try: + proc, stdout = Popen_communicate( + "security", "find-generic-password", + "-w", # write password to stdout + "-a", browser_keyring_name, # match "account" + "-s", browser_keyring_name + " Safe Storage", # match "service" + ) + + if stdout[-1:] == b"\n": + stdout = stdout[:-1] + return stdout + except Exception as exc: + logger.warning("exception running find-generic-password (%s: %s)", + exc.__class__.__name__, exc) + return None + + +def _get_windows_v10_key(browser_root): + path = _find_most_recently_used_file(browser_root, "Local State") + if path is None: + logger.error("could not find local state file") + return None + logger.debug("Found local state file at '%s'", path) + with open(path, encoding="utf8") as f: + data = json.load(f) + try: + base64_key = data["os_crypt"]["encrypted_key"] + except KeyError: + logger.error("no encrypted key in Local State") + return None + encrypted_key = binascii.a2b_base64(base64_key) + prefix = b"DPAPI" + if not encrypted_key.startswith(prefix): + logger.error("invalid key") + return None + return _decrypt_windows_dpapi(encrypted_key[len(prefix):]) + + +# -------------------------------------------------------------------- +# utility + +class ParserError(Exception): + pass + + +class DataParser: + def __init__(self, data): + self.cursor = 0 + self._data = data + + def read_bytes(self, num_bytes): + if num_bytes < 0: + raise ParserError("invalid read of {} bytes".format(num_bytes)) + end = self.cursor + num_bytes + if end > len(self._data): + raise ParserError("reached end of input") + data = self._data[self.cursor:end] + self.cursor = end + return data + + def expect_bytes(self, expected_value, message): + value = self.read_bytes(len(expected_value)) + if value != expected_value: + raise ParserError("unexpected value: {} != {} ({})".format( + value, expected_value, message)) + + def read_uint(self, big_endian=False): + data_format = ">I" if big_endian else "<I" + return struct.unpack(data_format, self.read_bytes(4))[0] + + def read_double(self, big_endian=False): + data_format = ">d" if big_endian else "<d" + return struct.unpack(data_format, self.read_bytes(8))[0] + + def read_cstring(self): + buffer = [] + while True: + c = self.read_bytes(1) + if c == b"\x00": + return b"".join(buffer).decode() + else: + buffer.append(c) + + def skip(self, num_bytes, description="unknown"): + if num_bytes > 0: + logger.debug("skipping {} bytes ({}): {!r}".format( + num_bytes, description, self.read_bytes(num_bytes))) + elif num_bytes < 0: + raise ParserError("invalid skip of {} bytes".format(num_bytes)) + + def skip_to(self, offset, description="unknown"): + self.skip(offset - self.cursor, description) + + def skip_to_end(self, description="unknown"): + self.skip_to(len(self._data), description) + + +class DatabaseCopy(): + + def __init__(self, path): + self.path = path + self.directory = self.database = None + + def __enter__(self): + 
try: + self.directory = tempfile.TemporaryDirectory(prefix="gallery-dl-") + path_copy = os.path.join(self.directory.name, "copy.sqlite") + shutil.copyfile(self.path, path_copy) + self.database = db = sqlite3.connect( + path_copy, isolation_level=None, check_same_thread=False) + return db + except BaseException: + if self.directory: + self.directory.cleanup() + raise + + def __exit__(self, exc, value, tb): + self.database.close() + self.directory.cleanup() + + +def Popen_communicate(*args): + proc = subprocess.Popen( + args, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) + try: + stdout, stderr = proc.communicate() + except BaseException: # Including KeyboardInterrupt + proc.kill() + proc.wait() + raise + return proc, stdout + + +""" +https://chromium.googlesource.com/chromium/src/+/refs/heads +/main/base/nix/xdg_util.h - DesktopEnvironment +""" +DE_OTHER = "other" +DE_CINNAMON = "cinnamon" +DE_GNOME = "gnome" +DE_KDE = "kde" +DE_PANTHEON = "pantheon" +DE_UNITY = "unity" +DE_XFCE = "xfce" + + +""" +https://chromium.googlesource.com/chromium/src/+/refs/heads +/main/components/os_crypt/key_storage_util_linux.h - SelectedLinuxBackend +""" +KEYRING_KWALLET = "kwallet" +KEYRING_GNOMEKEYRING = "gnomekeyring" +KEYRING_BASICTEXT = "basictext" +SUPPORTED_KEYRINGS = {"kwallet", "gnomekeyring", "basictext"} + + +def _get_linux_desktop_environment(env): + """ + Ref: https://chromium.googlesource.com/chromium/src/+/refs/heads + /main/base/nix/xdg_util.cc - GetDesktopEnvironment + """ + xdg_current_desktop = env.get("XDG_CURRENT_DESKTOP") + desktop_session = env.get("DESKTOP_SESSION") + + if xdg_current_desktop: + xdg_current_desktop = (xdg_current_desktop.partition(":")[0] + .strip().lower()) + + if xdg_current_desktop == "unity": + if desktop_session and "gnome-fallback" in desktop_session: + return DE_GNOME + else: + return DE_UNITY + elif xdg_current_desktop == "gnome": + return DE_GNOME + elif xdg_current_desktop == "x-cinnamon": + return DE_CINNAMON + elif xdg_current_desktop == "kde": + return DE_KDE + elif xdg_current_desktop == "pantheon": + return DE_PANTHEON + elif xdg_current_desktop == "xfce": + return DE_XFCE + + if desktop_session: + if desktop_session in ("mate", "gnome"): + return DE_GNOME + if "kde" in desktop_session: + return DE_KDE + if "xfce" in desktop_session: + return DE_XFCE + + if "GNOME_DESKTOP_SESSION_ID" in env: + return DE_GNOME + if "KDE_FULL_SESSION" in env: + return DE_KDE + return DE_OTHER + + +def _mac_absolute_time_to_posix(timestamp): + return int((datetime(2001, 1, 1, 0, 0, tzinfo=timezone.utc) + + timedelta(seconds=timestamp)).timestamp()) + + +def pbkdf2_sha1(password, salt, iterations, key_length): + return pbkdf2_hmac("sha1", password, salt, iterations, key_length) + + +def _decrypt_aes_cbc(ciphertext, key, initialization_vector=b" " * 16): + plaintext = aes.unpad_pkcs7( + aes.aes_cbc_decrypt_bytes(ciphertext, key, initialization_vector)) + try: + return plaintext.decode() + except UnicodeDecodeError: + logger.warning("failed to decrypt cookie (AES-CBC) because UTF-8 " + "decoding failed. Possibly the key is wrong?") + return None + + +def _decrypt_aes_gcm(ciphertext, key, nonce, authentication_tag): + try: + plaintext = aes.aes_gcm_decrypt_and_verify_bytes( + ciphertext, key, authentication_tag, nonce) + except ValueError: + logger.warning("failed to decrypt cookie (AES-GCM) because MAC check " + "failed. 
Possibly the key is wrong?") + return None + + try: + return plaintext.decode() + except UnicodeDecodeError: + logger.warning("failed to decrypt cookie (AES-GCM) because UTF-8 " + "decoding failed. Possibly the key is wrong?") + return None + + +def _decrypt_windows_dpapi(ciphertext): + """ + References: + - https://docs.microsoft.com/en-us/windows + /win32/api/dpapi/nf-dpapi-cryptunprotectdata + """ + from ctypes.wintypes import DWORD + + class DATA_BLOB(ctypes.Structure): + _fields_ = [("cbData", DWORD), + ("pbData", ctypes.POINTER(ctypes.c_char))] + + buffer = ctypes.create_string_buffer(ciphertext) + blob_in = DATA_BLOB(ctypes.sizeof(buffer), buffer) + blob_out = DATA_BLOB() + ret = ctypes.windll.crypt32.CryptUnprotectData( + ctypes.byref(blob_in), # pDataIn + None, # ppszDataDescr: human readable description of pDataIn + None, # pOptionalEntropy: salt? + None, # pvReserved: must be NULL + None, # pPromptStruct: information about prompts to display + 0, # dwFlags + ctypes.byref(blob_out) # pDataOut + ) + if not ret: + logger.warning("failed to decrypt with DPAPI") + return None + + result = ctypes.string_at(blob_out.pbData, blob_out.cbData) + ctypes.windll.kernel32.LocalFree(blob_out.pbData) + return result + + +def _find_most_recently_used_file(root, filename): + # if there are multiple browser profiles, take the most recently used one + paths = [] + for curr_root, dirs, files in os.walk(root): + for file in files: + if file == filename: + paths.append(os.path.join(curr_root, file)) + if not paths: + return None + return max(paths, key=lambda path: os.lstat(path).st_mtime) + + +def _is_path(value): + return os.path.sep in value + + +def _parse_browser_specification(browser, profile=None, keyring=None): + if browser not in SUPPORTED_BROWSERS: + raise ValueError("unsupported browser '{}'".format(browser)) + if keyring and keyring not in SUPPORTED_KEYRINGS: + raise ValueError("unsupported keyring '{}'".format(keyring)) + if profile and _is_path(profile): + profile = os.path.expanduser(profile) + return browser, profile, keyring diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py index 5675081..e686c70 100644 --- a/gallery_dl/extractor/artstation.py +++ b/gallery_dl/extractor/artstation.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2021 Mike Fährmann +# Copyright 2018-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -20,6 +20,7 @@ class ArtstationExtractor(Extractor): filename_fmt = "{category}_{id}_{asset[id]}_{title}.{extension}" directory_fmt = ("{category}", "{userinfo[username]}") archive_fmt = "{asset[id]}" + browser = "firefox" root = "https://www.artstation.com" def __init__(self, match): diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index abb352c..cac8c2d 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -306,23 +306,29 @@ class Extractor(): cookiefile = util.expand_path(cookies) try: with open(cookiefile) as fp: - cookies = util.load_cookiestxt(fp) + util.cookiestxt_load(fp, self._cookiejar) except Exception as exc: self.log.warning("cookies: %s", exc) else: - self._update_cookies(cookies) self._cookiefile = cookiefile + elif isinstance(cookies, (list, tuple)): + from ..cookies import load_cookies + try: + load_cookies(self._cookiejar, cookies) + except Exception as exc: + self.log.warning("cookies: %s", exc) else: self.log.warning( - "expected 'dict' or 
'str' value for 'cookies' option, " - "got '%s' (%s)", cookies.__class__.__name__, cookies) + "Expected 'dict', 'list', or 'str' value for 'cookies' " + "option, got '%s' (%s)", + cookies.__class__.__name__, cookies) def _store_cookies(self): """Store the session's cookiejar in a cookies.txt file""" if self._cookiefile and self.config("cookies-update", True): try: with open(self._cookiefile, "w") as fp: - util.save_cookiestxt(fp, self._cookiejar) + util.cookiestxt_store(fp, self._cookiejar) except OSError as exc: self.log.warning("cookies: %s", exc) diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py index 04e5926..093113d 100644 --- a/gallery_dl/extractor/foolfuuka.py +++ b/gallery_dl/extractor/foolfuuka.py @@ -6,7 +6,7 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for 4chan archives based on FoolFuuka""" +"""Extractors for FoolFuuka 4chan archives""" from .common import BaseExtractor, Message from .. import text @@ -16,6 +16,7 @@ import itertools class FoolfuukaExtractor(BaseExtractor): """Base extractor for FoolFuuka based boards/archives""" basecategory = "foolfuuka" + filename_fmt = "{timestamp_ms} {filename_media}.{extension}" archive_fmt = "{board[shortname]}_{num}_{timestamp}" external = "default" @@ -40,6 +41,9 @@ class FoolfuukaExtractor(BaseExtractor): post["filename"], _, post["extension"] = \ media["media"].rpartition(".") + post["filename_media"] = media["media_filename"].rpartition(".")[0] + post["timestamp_ms"] = text.parse_int( + media["media_orig"].rpartition(".")[0]) yield Message.Url, url, post def metadata(self): @@ -66,6 +70,7 @@ BASE_PATTERN = FoolfuukaExtractor.update({ }, "archivedmoe": { "root": "https://archived.moe", + "pattern": r"archived\.moe", }, "archiveofsins": { "root": "https://archiveofsins.com", @@ -73,12 +78,15 @@ BASE_PATTERN = FoolfuukaExtractor.update({ }, "b4k": { "root": "https://arch.b4k.co", + "pattern": r"arch\.b4k\.co", }, "desuarchive": { "root": "https://desuarchive.org", + "pattern": r"desuarchive\.org", }, "fireden": { "root": "https://boards.fireden.net", + "pattern": r"boards\.fireden\.net", }, "nyafuu": { "root": "https://archive.nyafuu.org", @@ -90,9 +98,11 @@ BASE_PATTERN = FoolfuukaExtractor.update({ }, "thebarchive": { "root": "https://thebarchive.com", + "pattern": r"thebarchive\.com", }, "wakarimasen": { "root": "https://archive.wakarimasen.moe", + "pattern": r"archive\.wakarimasen\.moe", }, }) @@ -101,7 +111,7 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor): """Base extractor for threads on FoolFuuka based boards/archives""" subcategory = "thread" directory_fmt = ("{category}", "{board[shortname]}", - "{thread_num}{title:? 
- //}") + "{thread_num} {title|comment[:50]}") pattern = BASE_PATTERN + r"/([^/?#]+)/thread/(\d+)" test = ( ("https://archive.4plebs.org/tg/thread/54059290", { diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py index c09eb96..382cc25 100644 --- a/gallery_dl/extractor/foolslide.py +++ b/gallery_dl/extractor/foolslide.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2021 Mike Fährmann +# Copyright 2016-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -41,6 +41,7 @@ class FoolslideExtractor(BaseExtractor): BASE_PATTERN = FoolslideExtractor.update({ "kireicake": { "root": "https://reader.kireicake.com", + "pattern": r"reader\.kireicake\.com", }, "powermanga": { "root": "https://read.powermanga.org", diff --git a/gallery_dl/extractor/gelbooru_v01.py b/gallery_dl/extractor/gelbooru_v01.py index 541f454..9c19664 100644 --- a/gallery_dl/extractor/gelbooru_v01.py +++ b/gallery_dl/extractor/gelbooru_v01.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- -# Copyright 2021 Mike Fährmann +# Copyright 2021-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for Gelbooru v0.1 sites""" +"""Extractors for Gelbooru Beta 0.1.11 sites""" from . import booru from .. import text @@ -42,14 +42,43 @@ class GelbooruV01Extractor(booru.BooruExtractor): return post + def _pagination(self, url, begin, end): + pid = self.page_start + + while True: + page = self.request(url + str(pid)).text + + cnt = 0 + for post_id in text.extract_iter(page, begin, end): + yield self._parse_post(post_id) + cnt += 1 + + if cnt < self.per_page: + return + pid += self.per_page + BASE_PATTERN = GelbooruV01Extractor.update({ - "thecollection" : {"root": "https://the-collection.booru.org"}, - "illusioncardsbooru": {"root": "https://illusioncards.booru.org"}, - "allgirlbooru" : {"root": "https://allgirl.booru.org"}, - "drawfriends" : {"root": "https://drawfriends.booru.org"}, - "vidyart" : {"root": "https://vidyart.booru.org"}, - "theloudbooru" : {"root": "https://tlb.booru.org"}, + "thecollection": { + "root": "https://the-collection.booru.org", + "pattern": r"the-collection\.booru\.org", + }, + "illusioncardsbooru": { + "root": "https://illusioncards.booru.org", + "pattern": r"illusioncards\.booru\.org", + }, + "allgirlbooru": { + "root": "https://allgirl.booru.org", + "pattern": r"allgirl\.booru\.org", + }, + "drawfriends": { + "root": "https://drawfriends.booru.org", + "pattern": r"drawfriends\.booru\.org", + }, + "vidyart": { + "root": "https://vidyart.booru.org", + "pattern": r"vidyart\.booru\.org", + }, }) @@ -75,7 +104,6 @@ class GelbooruV01TagExtractor(GelbooruV01Extractor): }), ("https://drawfriends.booru.org/index.php?page=post&s=list&tags=all"), ("https://vidyart.booru.org/index.php?page=post&s=list&tags=all"), - ("https://tlb.booru.org/index.php?page=post&s=list&tags=all"), ) def __init__(self, match): @@ -88,20 +116,42 @@ class GelbooruV01TagExtractor(GelbooruV01Extractor): def posts(self): url = "{}/index.php?page=post&s=list&tags={}&pid=".format( self.root, self.tags) - pid = self.page_start + return self._pagination(url, 'class="thumb"><a id="p', '"') - while True: - page = self.request(url + str(pid)).text - cnt = 0 - for post_id in text.extract_iter( - page, 'class="thumb"><a id="p', '"'): - yield 
self._parse_post(post_id) - cnt += 1 +class GelbooruV01FavoriteExtractor(GelbooruV01Extractor): + subcategory = "favorite" + directory_fmt = ("{category}", "favorites", "{favorite_id}") + archive_fmt = "f_{favorite_id}_{id}" + per_page = 50 + pattern = BASE_PATTERN + r"/index\.php\?page=favorites&s=view&id=(\d+)" + test = ( + (("https://the-collection.booru.org" + "/index.php?page=favorites&s=view&id=1166"), { + "count": 2, + }), + (("https://illusioncards.booru.org" + "/index.php?page=favorites&s=view&id=84887"), { + "count": 2, + }), + ("https://allgirl.booru.org/index.php?page=favorites&s=view&id=380", { + "count": 4, + }), + ("https://drawfriends.booru.org/index.php?page=favorites&s=view&id=1"), + ("https://vidyart.booru.org/index.php?page=favorites&s=view&id=1"), + ) - if cnt < self.per_page: - return - pid += self.per_page + def __init__(self, match): + GelbooruV01Extractor.__init__(self, match) + self.favorite_id = match.group(match.lastindex) + + def metadata(self): + return {"favorite_id": text.parse_int(self.favorite_id)} + + def posts(self): + url = "{}/index.php?page=favorites&s=view&id={}&pid=".format( + self.root, self.favorite_id) + return self._pagination(url, "posts[", "]") class GelbooruV01PostExtractor(GelbooruV01Extractor): @@ -141,7 +191,6 @@ class GelbooruV01PostExtractor(GelbooruV01Extractor): }), ("https://drawfriends.booru.org/index.php?page=post&s=view&id=107474"), ("https://vidyart.booru.org/index.php?page=post&s=view&id=383111"), - ("https://tlb.booru.org/index.php?page=post&s=view&id=127223"), ) def __init__(self, match): diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py index 7e16a51..2dd0c0c 100644 --- a/gallery_dl/extractor/gelbooru_v02.py +++ b/gallery_dl/extractor/gelbooru_v02.py @@ -6,7 +6,7 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for Gelbooru v0.2 sites""" +"""Extractors for Gelbooru Beta 0.2 sites""" from . import booru from .. 
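The _pagination helper factored out of the tag extractor above is plain offset paging: fetch url + pid, yield every post id found between the begin/end markers, and stop as soon as a page comes back short. The same loop shape as a generic sketch; fetch and extract_ids are hypothetical stand-ins for self.request(...).text and text.extract_iter:

def paginate(fetch, url, extract_ids, per_page=20, page_start=0):
    # 'pid' is a result offset: 0, per_page, 2*per_page, ...
    pid = page_start
    while True:
        page = fetch(url + str(pid))
        cnt = 0
        for post_id in extract_ids(page):
            yield post_id
            cnt += 1
        if cnt < per_page:  # a short page means there are no more results
            return
        pid += per_page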
import text, util, exception @@ -26,6 +26,9 @@ class GelbooruV02Extractor(booru.BooruExtractor): except KeyError: self.api_root = self.root + if self.category == "realbooru": + self._file_url = self._file_url_realbooru + def _api_request(self, params): url = self.api_root + "/index.php?page=dapi&s=post&q=index" return ElementTree.fromstring(self.request(url, params=params).text) @@ -61,6 +64,14 @@ class GelbooruV02Extractor(booru.BooruExtractor): post["date"] = text.parse_datetime( post["created_at"], "%a %b %d %H:%M:%S %z %Y") + def _file_url_realbooru(self, post): + url = post["file_url"] + if url.count("/") == 5: + md5 = post["md5"] + url = "{}/images/{}/{}/{}.{}".format( + self.root, md5[0:2], md5[2:4], md5, url.rpartition(".")[2]) + return url + def _extended_tags(self, post, page=None): if not page: url = "{}/index.php?page=post&s=view&id={}".format( @@ -105,11 +116,23 @@ class GelbooruV02Extractor(booru.BooruExtractor): INSTANCES = { - "realbooru": {"root": "https://realbooru.com"}, - "rule34" : {"root": "https://rule34.xxx", - "api_root": " https://api.rule34.xxx"}, - "safebooru": {"root": "https://safebooru.org"}, - "tbib" : {"root": "https://tbib.org"}, + "realbooru": { + "root": "https://realbooru.com", + "pattern": r"realbooru\.com", + }, + "rule34": { + "root": "https://rule34.xxx", + "pattern": r"rule34\.xxx", + "api_root": "https://api.rule34.xxx", + }, + "safebooru": { + "root": "https://safebooru.org", + "pattern": r"safebooru\.org", + }, + "tbib": { + "root": "https://tbib.org", + "pattern": r"tbib\.org", + }, } BASE_PATTERN = GelbooruV02Extractor.update(INSTANCES) @@ -147,7 +170,7 @@ class GelbooruV02TagExtractor(GelbooruV02Extractor): return {"search_tags": self.tags} def posts(self): - return self._pagination({"tags" : self.tags}) + return self._pagination({"tags": self.tags}) class GelbooruV02PoolExtractor(GelbooruV02Extractor): @@ -213,7 +236,7 @@ class GelbooruV02FavoriteExtractor(GelbooruV02Extractor): "count": 2, }), ("https://realbooru.com/index.php?page=favorites&s=view&id=274", { - "count": 4, + "count": 2, }), ("https://tbib.org/index.php?page=favorites&s=view&id=7881", { "count": 3, @@ -279,7 +302,8 @@ class GelbooruV02PostExtractor(GelbooruV02Extractor): }, }), ("https://realbooru.com/index.php?page=post&s=view&id=668483", { - "url": "2421b5b0e15d5e20f9067090a8b0fd4114d3e7d9", + "pattern": r"https://realbooru\.com/images/dc/b5" + r"/dcb5c0ce9ec0bf74a6930608985f4719\.jpeg", "content": "7f5873ce3b6cd295ea2e81fcb49583098ea9c8da", }), ("https://tbib.org/index.php?page=post&s=view&id=9233957", { diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index 2035655..fd78ce2 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2021 Mike Fährmann +# Copyright 2015-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -201,17 +201,24 @@ class ImgurAlbumExtractor(ImgurExtractor): ("https://imgur.com/a/TcBmQ", { "exception": exception.HttpError, }), + ("https://imgur.com/a/pjOnJA0", { # empty, no 'media' (#2557) + "count": 0, + }), ("https://www.imgur.com/a/TcBmP"), # www ("https://m.imgur.com/a/TcBmP"), # mobile ) def items(self): album = self.api.album(self.key) - album["date"] = text.parse_datetime(album["created_at"]) - images = album["media"] + try: + images = album["media"] + except KeyError: + return + del album["media"] count = len(images) + 
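_file_url_realbooru above rebuilds short CDN links into the md5-sharded /images/ab/cd/<md5>.<ext> layout, using url.count("/") == 5 to detect the short form. The same logic as a standalone function; the sample post values are illustrative only:

def realbooru_file_url(root, post):
    # rewrite 5-slash URLs into
    # <root>/images/<md5[0:2]>/<md5[2:4]>/<md5>.<ext>
    url = post["file_url"]
    if url.count("/") == 5:
        md5 = post["md5"]
        url = "{}/images/{}/{}/{}.{}".format(
            root, md5[0:2], md5[2:4], md5, url.rpartition(".")[2])
    return url

post = {  # sample values, illustrative only
    "file_url": "https://realbooru.com/images/dc/dcb5c0ce9ec0bf74a6930608985f4719.jpeg",
    "md5"     : "dcb5c0ce9ec0bf74a6930608985f4719",
}
print(realbooru_file_url("https://realbooru.com", post))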
album["date"] = text.parse_datetime(album["created_at"]) try: del album["ad_url"] diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index e07b64e..82c9858 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -80,12 +80,22 @@ class InstagramExtractor(Extractor): def request(self, url, **kwargs): response = Extractor.request(self, url, **kwargs) - if response.history and "/accounts/login/" in response.request.url: - if self._cursor: - self.log.info("Use '-o cursor=%s' to continue downloading " - "from the current position", self._cursor) - raise exception.StopExtraction( - "HTTP redirect to login page (%s)", response.request.url) + if response.history: + + url = response.request.url + if "/accounts/login/" in url: + page = "login" + elif "/challenge/" in url: + page = "challenge" + else: + page = None + + if page: + if self._cursor: + self.log.info("Use '-o cursor=%s' to continue downloading " + "from the current position", self._cursor) + raise exception.StopExtraction("HTTP redirect to %s page (%s)", + page, url.partition("?")[0]) www_claim = response.headers.get("x-ig-set-www-claim") if www_claim is not None: @@ -298,7 +308,7 @@ class InstagramExtractor(Extractor): video = None media = image - files.append({ + media = { "num" : num, "date" : text.parse_timestamp(item.get("taken_at") or media.get("taken_at")), @@ -309,7 +319,9 @@ class InstagramExtractor(Extractor): "video_url" : video["url"] if video else None, "width" : media["width"], "height" : media["height"], - }) + } + self._extract_tagged_users(item, media) + files.append(media) return data @@ -321,22 +333,45 @@ class InstagramExtractor(Extractor): "abcdefghijklmnopqrstuvwxyz" "0123456789-_") - def _extract_tagged_users(self, src, dest): - if "edge_media_to_tagged_user" not in src: - return - edges = src["edge_media_to_tagged_user"]["edges"] + @staticmethod + def _extract_tagged_users(src, dest): + dest["tagged_users"] = tagged_users = [] + + edges = src.get("edge_media_to_tagged_user") if edges: - dest["tagged_users"] = tagged_users = [] - for edge in edges: + for edge in edges["edges"]: user = edge["node"]["user"] - tagged_users.append({ - "id" : user["id"], - "username" : user["username"], - "full_name": user["full_name"], - }) - - def _extract_shared_data(self, url): - page = self.request(url).text + tagged_users.append({"id" : user["id"], + "username" : user["username"], + "full_name": user["full_name"]}) + + usertags = src.get("usertags") + if usertags: + for tag in usertags["in"]: + user = tag["user"] + tagged_users.append({"id" : user["pk"], + "username" : user["username"], + "full_name": user["full_name"]}) + + mentions = src.get("reel_mentions") + if mentions: + for mention in mentions: + user = mention["user"] + tagged_users.append({"id" : user.get("pk"), + "username" : user["username"], + "full_name": user["full_name"]}) + + stickers = src.get("story_bloks_stickers") + if stickers: + for sticker in stickers: + sticker = sticker["bloks_sticker"] + if sticker["bloks_sticker_type"] == "mention": + user = sticker["sticker_data"]["ig_mention"] + tagged_users.append({"id" : user["account_id"], + "username" : user["username"], + "full_name": user["full_name"]}) + + def _extract_shared_data(self, page): shared_data, pos = text.extract( page, "window._sharedData =", ";</script>") additional_data, pos = text.extract( @@ -349,13 +384,15 @@ class InstagramExtractor(Extractor): return data def _extract_profile_page(self, url): - data = 
self._extract_shared_data(url)["entry_data"] + page = self.request(url).text + data = self._extract_shared_data(page)["entry_data"] if "HttpErrorPage" in data: raise exception.NotFoundError("user") return data["ProfilePage"][0]["graphql"]["user"] def _extract_post_page(self, url): - data = self._extract_shared_data(url)["entry_data"] + page = self.request(url).text + data = self._extract_shared_data(page)["entry_data"] if "HttpErrorPage" in data: raise exception.NotFoundError("post") return data["PostPage"][0] @@ -524,7 +561,8 @@ class InstagramTagExtractor(InstagramExtractor): def posts(self): url = "{}/explore/tags/{}/".format(self.root, self.item) - page = self._extract_shared_data(url)["entry_data"]["TagPage"][0] + page = self._extract_shared_data( + self.request(url).text)["entry_data"]["TagPage"][0] if "data" in page: return self._pagination_sections(page["data"]["recent"]) @@ -718,8 +756,12 @@ class InstagramStoriesExtractor(InstagramExtractor): reel_id = "highlight:" + self.highlight_id else: url = "{}/stories/{}/".format(self.root, self.user) + with self.request(url, allow_redirects=False) as response: + if 300 <= response.status_code < 400: + return () + page = response.text try: - data = self._extract_shared_data(url)["entry_data"] + data = self._extract_shared_data(page)["entry_data"] user = data["StoriesPage"][0]["user"] except KeyError: return () diff --git a/gallery_dl/extractor/khinsider.py b/gallery_dl/extractor/khinsider.py index 67a1a95..e7827b1 100644 --- a/gallery_dl/extractor/khinsider.py +++ b/gallery_dl/extractor/khinsider.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2020 Mike Fährmann +# Copyright 2016-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -26,7 +26,18 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor): "pattern": r"https?://vgm(site|downloads).com" r"/soundtracks/horizon-riders-wii/[^/]+" r"/Horizon%20Riders%20Wii%20-%20Full%20Soundtrack.mp3", - "keyword": "12ca70e0709ea15250e577ea388cf2b5b0c65630", + "keyword": { + "album": { + "count": 1, + "date": "Sep 18th, 2016", + "name": "Horizon Riders (Wii)", + "size": 26214400, + "type": "Gamerip", + }, + "extension": "mp3", + "filename": "Horizon Riders Wii - Full Soundtrack", + }, + "count": 1, }) def __init__(self, match): @@ -48,10 +59,10 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor): def metadata(self, page): extr = text.extract_from(page) return {"album": { - "name" : text.unescape(extr("Album name: <b>", "<")), + "name" : text.unescape(extr("<h2>", "<")), "count": text.parse_int(extr("Number of Files: <b>", "<")), "size" : text.parse_bytes(extr("Total Filesize: <b>", "<")[:-1]), - "date" : extr("Date added: <b>", "<"), + "date" : extr("Date Added: <b>", "<"), "type" : extr("Album type: <b>", "<"), }} diff --git a/gallery_dl/extractor/lolisafe.py b/gallery_dl/extractor/lolisafe.py index ad7cd1d..b6a508d 100644 --- a/gallery_dl/extractor/lolisafe.py +++ b/gallery_dl/extractor/lolisafe.py @@ -63,6 +63,12 @@ class LolisafeAlbumExtractor(LolisafeExtractor): LolisafeExtractor.__init__(self, match) self.album_id = match.group(match.lastindex) + domain = self.config("domain") + if domain is None or domain == "auto": + self.root = text.root_from_url(match.group(0)) + else: + self.root = text.ensure_http_scheme(domain) + def items(self): files, data = self.fetch_album(self.album_id) diff --git a/gallery_dl/extractor/mangafox.py 
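The new lolisafe "domain" option above either keeps the root of the URL that was matched ("auto" or unset) or swaps in a configured host. A sketch of that resolution using only the stdlib; text.root_from_url and text.ensure_http_scheme are the upstream helpers, and the hosts below are hypothetical:

from urllib.parse import urlsplit

def resolve_root(matched_url, domain=None):
    if domain is None or domain == "auto":
        # keep the matched URL's root (text.root_from_url upstream);
        # assumes the URL carries a scheme
        parts = urlsplit(matched_url)
        return "{}://{}".format(parts.scheme, parts.netloc)
    # configured host, scheme added if missing
    # (text.ensure_http_scheme upstream)
    if not domain.startswith(("http://", "https://")):
        domain = "https://" + domain
    return domain

print(resolve_root("https://files.example.org/album/abc123"))
print(resolve_root("https://files.example.org/album/abc123", "cdn.example.net"))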
b/gallery_dl/extractor/mangafox.py index f6514ca..4808105 100644 --- a/gallery_dl/extractor/mangafox.py +++ b/gallery_dl/extractor/mangafox.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2021 Mike Fährmann +# Copyright 2017-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -34,6 +34,7 @@ class MangafoxChapterExtractor(ChapterExtractor): base, self.cstr, self.volume, self.chapter, self.minor = match.groups() self.urlbase = self.root + base ChapterExtractor.__init__(self, match, self.urlbase + "/1.html") + self.session.headers["Referer"] = self.root + "/" def metadata(self, page): manga, pos = text.extract(page, "<title>", "</title>") diff --git a/gallery_dl/extractor/mangahere.py b/gallery_dl/extractor/mangahere.py index f655f94..461c92d 100644 --- a/gallery_dl/extractor/mangahere.py +++ b/gallery_dl/extractor/mangahere.py @@ -17,8 +17,8 @@ class MangahereBase(): """Base class for mangahere extractors""" category = "mangahere" root = "https://www.mangahere.cc" - mobile_root = "https://m.mangahere.cc" - url_fmt = mobile_root + "/manga/{}/{}.html" + root_mobile = "https://m.mangahere.cc" + url_fmt = root_mobile + "/manga/{}/{}.html" class MangahereChapterExtractor(MangahereBase, ChapterExtractor): @@ -42,6 +42,7 @@ class MangahereChapterExtractor(MangahereBase, ChapterExtractor): self.part, self.volume, self.chapter = match.groups() url = self.url_fmt.format(self.part, 1) ChapterExtractor.__init__(self, match, url) + self.session.headers["Referer"] = self.root_mobile + "/" def metadata(self, page): pos = page.index("</select>") diff --git a/gallery_dl/extractor/mangasee.py b/gallery_dl/extractor/mangasee.py index 0b0da65..2bd11ef 100644 --- a/gallery_dl/extractor/mangasee.py +++ b/gallery_dl/extractor/mangasee.py @@ -9,7 +9,7 @@ """Extractors for https://mangasee123.com/""" from .common import ChapterExtractor, MangaExtractor -from .. import text +from .. 
import text, util import json @@ -57,6 +57,15 @@ class MangaseeChapterExtractor(MangaseeBase, ChapterExtractor): }, }) + def __init__(self, match): + ChapterExtractor.__init__(self, match) + self.session.headers["Referer"] = self.gallery_url + + domain = "mangasee123.com" + cookies = self.session.cookies + if not cookies.get("PHPSESSID", domain=domain): + cookies.set("PHPSESSID", util.generate_token(13), domain=domain) + def metadata(self, page): extr = text.extract_from(page) self.chapter = data = json.loads(extr("vm.CurChapter =", ";\r\n")) diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index cd7cabb..6e780e8 100644 --- a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2021 Mike Fährmann +# Copyright 2019-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -60,12 +60,14 @@ class MastodonExtractor(BaseExtractor): INSTANCES = { "mastodon.social": { "root" : "https://mastodon.social", + "pattern" : r"mastodon\.social", "access-token" : "Y06R36SMvuXXN5_wiPKFAEFiQaMSQg0o_hGgc86Jj48", "client-id" : "dBSHdpsnOUZgxOnjKSQrWEPakO3ctM7HmsyoOd4FcRo", "client-secret": "DdrODTHs_XoeOsNVXnILTMabtdpWrWOAtrmw91wU1zI", }, "pawoo": { "root" : "https://pawoo.net", + "pattern" : r"pawoo\.net", "access-token" : "c12c9d275050bce0dc92169a28db09d7" "0d62d0a75a8525953098c167eacd3668", "client-id" : "978a25f843ec01e53d09be2c290cd75c" @@ -75,6 +77,7 @@ INSTANCES = { }, "baraag": { "root" : "https://baraag.net", + "pattern" : r"baraag\.net", "access-token" : "53P1Mdigf4EJMH-RmeFOOSM9gdSDztmrAYFgabOKKE0", "client-id" : "czxx2qilLElYHQ_sm-lO8yXuGwOHxLX9RYYaD0-nq1o", "client-secret": "haMaFdMBgK_-BIxufakmI2gFgkYjqmgXGEO2tB-R2xY", diff --git a/gallery_dl/extractor/moebooru.py b/gallery_dl/extractor/moebooru.py index 604d65c..65b9a83 100644 --- a/gallery_dl/extractor/moebooru.py +++ b/gallery_dl/extractor/moebooru.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2020-2021 Mike Fährmann +# Copyright 2020-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -54,6 +54,7 @@ class MoebooruExtractor(BooruExtractor): BASE_PATTERN = MoebooruExtractor.update({ "yandere": { "root": "https://yande.re", + "pattern": r"yande\.re", }, "konachan": { "root": "https://konachan.com", @@ -61,6 +62,7 @@ BASE_PATTERN = MoebooruExtractor.update({ }, "hypnohub": { "root": "https://hypnohub.net", + "pattern": r"hypnohub\.net", }, "sakugabooru": { "root": "https://www.sakugabooru.com", @@ -68,6 +70,7 @@ BASE_PATTERN = MoebooruExtractor.update({ }, "lolibooru": { "root": "https://lolibooru.moe", + "pattern": r"lolibooru\.moe", }, }) diff --git a/gallery_dl/extractor/naverwebtoon.py b/gallery_dl/extractor/naverwebtoon.py index 348f6a1..eadd460 100644 --- a/gallery_dl/extractor/naverwebtoon.py +++ b/gallery_dl/extractor/naverwebtoon.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- # Copyright 2021 Seonghyeon Cho +# Copyright 2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -10,8 +11,10 @@ from .common import GalleryExtractor, Extractor, Message from .. 
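The chapter extractors above prime the shared session before any page request: mangafox and mangahere pin a Referer header (presumably because the image hosts check it), and mangasee additionally seeds a PHPSESSID cookie with a random token when none exists. The same priming with plain requests; secrets.token_hex(13) stands in for util.generate_token(13):

import secrets
import requests

root = "https://mangasee123.com"
session = requests.Session()

# every request made through 'session' now sends a Referer header
session.headers["Referer"] = root + "/"

# seed a PHPSESSID cookie with a random value if none is present
domain = "mangasee123.com"
if not session.cookies.get("PHPSESSID", domain=domain):
    session.cookies.set("PHPSESSID", secrets.token_hex(13), domain=domain)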
import text +import re -BASE_PATTERN = r"(?:https?://)?comic\.naver\.com/webtoon" +BASE_PATTERN = (r"(?:https?://)?comic\.naver\.com" + r"/(webtoon|challenge|bestChallenge)") class NaverwebtoonBase(): @@ -25,19 +28,33 @@ class NaverwebtoonEpisodeExtractor(NaverwebtoonBase, GalleryExtractor): directory_fmt = ("{category}", "{comic}") filename_fmt = "{episode:>03}-{num:>02}.{extension}" archive_fmt = "{title_id}_{episode}_{num}" - pattern = BASE_PATTERN + r"/detail\.nhn\?([^#]+)" + pattern = BASE_PATTERN + r"/detail(?:\.nhn)?\?([^#]+)" test = ( - (("https://comic.naver.com/webtoon/detail.nhn?" - "titleId=26458&no=1&weekday=tue"), { + (("https://comic.naver.com/webtoon/detail" + "?titleId=26458&no=1&weekday=tue"), { "url": "47a956ba8c7a837213d5985f50c569fcff986f75", "content": "3806b6e8befbb1920048de9888dfce6220f69a60", "count": 14 }), + (("https://comic.naver.com/challenge/detail" + "?titleId=765124&no=1"), { + "pattern": r"https://image-comic\.pstatic\.net/nas" + r"/user_contents_data/challenge_comic/2021/01/19" + r"/342586/upload_7149856273586337846\.jpeg", + "count": 1, + }), + (("https://comic.naver.com/bestChallenge/detail.nhn" + "?titleId=771467&no=3"), { + "pattern": r"https://image-comic\.pstatic\.net/nas" + r"/user_contents_data/challenge_comic/2021/04/28" + r"/345534/upload_3617293622396203109\.jpeg", + "count": 1, + }), ) def __init__(self, match): - query = match.group(1) - url = "{}/webtoon/detail.nhn?{}".format(self.root, query) + path, query = match.groups() + url = "{}/{}/detail?{}".format(self.root, path, query) GalleryExtractor.__init__(self, match, url) query = text.parse_query(query) @@ -70,22 +87,31 @@ class NaverwebtoonEpisodeExtractor(NaverwebtoonBase, GalleryExtractor): class NaverwebtoonComicExtractor(NaverwebtoonBase, Extractor): subcategory = "comic" categorytransfer = True - pattern = (BASE_PATTERN + r"/list\.nhn\?([^#]+)") + pattern = (BASE_PATTERN + r"/list(?:\.nhn)?\?([^#]+)") test = ( - ("https://comic.naver.com/webtoon/list.nhn?titleId=22073", { + ("https://comic.naver.com/webtoon/list?titleId=22073", { "pattern": NaverwebtoonEpisodeExtractor.pattern, "count": 32, }), + ("https://comic.naver.com/challenge/list?titleId=765124", { + "pattern": NaverwebtoonEpisodeExtractor.pattern, + "count": 25, + }), + ("https://comic.naver.com/bestChallenge/list.nhn?titleId=789786", { + "pattern": NaverwebtoonEpisodeExtractor.pattern, + "count": ">= 12", + }), ) def __init__(self, match): Extractor.__init__(self, match) - query = text.parse_query(match.group(1)) + self.path, query = match.groups() + query = text.parse_query(query) self.title_id = query.get("titleId") self.page_no = text.parse_int(query.get("page"), 1) def items(self): - url = self.root + "/webtoon/list.nhn" + url = "{}/{}/list".format(self.root, self.path) params = {"titleId": self.title_id, "page": self.page_no} data = {"_extractor": NaverwebtoonEpisodeExtractor} @@ -103,7 +129,8 @@ class NaverwebtoonComicExtractor(NaverwebtoonBase, Extractor): def get_episode_urls(self, page): """Extract and return all episode urls in page""" return [ - self.root + "/webtoon/detail.nhn?" 
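The widened BASE_PATTERN above lets both extractors cover the challenge and bestChallenge sections next to regular webtoons, with the ".nhn" suffix now optional. A quick check of the episode pattern in isolation:

import re

BASE = (r"(?:https?://)?comic\.naver\.com"
        r"/(webtoon|challenge|bestChallenge)")
detail = re.compile(BASE + r"/detail(?:\.nhn)?\?([^#]+)")

for url in (
    "https://comic.naver.com/webtoon/detail?titleId=26458&no=1",
    "https://comic.naver.com/bestChallenge/detail.nhn?titleId=771467&no=3",
):
    m = detail.match(url)
    print(m.group(1), m.group(2))  # section path, query string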
+ query - for query in text.extract_iter( - page, '<a href="/webtoon/detail?', '"') + self.root + path + for path in re.findall( + r'<a href="(/(?:webtoon|challenge|bestChallenge)' + r'/detail\?[^"]+)', page) ][::2] diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index 90ca01d..832831f 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -91,6 +91,10 @@ class NijieExtractor(AsynchronousMixin, Extractor): "url": url, }) + @staticmethod + def _extract_user_name(page): + return text.unescape(text.extract(page, "<br />", "<")[0] or "") + def login(self): """Login and obtain session cookies""" if not self._check_cookies(self.cookienames): @@ -119,9 +123,8 @@ class NijieExtractor(AsynchronousMixin, Extractor): while True: page = self.request(url, params=params, notfound="artist").text - if not self.user_name: - self.user_name = text.unescape(text.extract( - page, '<br />', '<')[0] or "") + if self.user_name is None: + self.user_name = self._extract_user_name(page) yield from text.extract_iter(page, 'illust_id="', '"') if '<a rel="next"' not in page: @@ -137,11 +140,12 @@ class NijieUserExtractor(NijieExtractor): test = ("https://nijie.info/members.php?id=44",) def items(self): - base = "{}/{{}}.php?id={}".format(self.root, self.user_id) + fmt = "{}/{{}}.php?id={}".format(self.root, self.user_id).format return self._dispatch_extractors(( - (NijieIllustrationExtractor, base.format("members_illust")), - (NijieDoujinExtractor , base.format("members_dojin")), - (NijieFavoriteExtractor , base.format("user_like_illust_view")), + (NijieIllustrationExtractor, fmt("members_illust")), + (NijieDoujinExtractor , fmt("members_dojin")), + (NijieFavoriteExtractor , fmt("user_like_illust_view")), + (NijieNuitaExtractor , fmt("history_nuita")), ), ("illustration", "doujin")) @@ -217,6 +221,36 @@ class NijieFavoriteExtractor(NijieExtractor): return data +class NijieNuitaExtractor(NijieExtractor): + """Extractor for a nijie user's 抜いた list""" + subcategory = "nuita" + directory_fmt = ("{category}", "nuita", "{user_id}") + archive_fmt = "n_{user_id}_{image_id}_{num}" + pattern = BASE_PATTERN + r"/history_nuita\.php\?id=(\d+)" + test = ("https://nijie.info/history_nuita.php?id=728995", { + "range": "1-10", + "count": 10, + "keyword": { + "user_id" : 728995, + "user_name": "莚", + }, + }) + + def image_ids(self): + return self._pagination("history_nuita") + + def _extract_data(self, page): + data = NijieExtractor._extract_data(page) + data["user_id"] = self.user_id + data["user_name"] = self.user_name + return data + + @staticmethod + def _extract_user_name(page): + return text.unescape(text.extract( + page, "<title>", "さんの抜いた")[0] or "") + + class NijieImageExtractor(NijieExtractor): """Extractor for a work/image from nijie.info""" subcategory = "image" diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index 428f772..653822f 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py @@ -11,6 +11,7 @@ from .common import Extractor, Message from . import deviantart, flickr, mastodon, pixiv, reddit, smugmug, tumblr from .. import text, oauth, util, config, exception +from ..output import stdout_write from ..cache import cache import urllib.parse import hashlib @@ -37,7 +38,7 @@ class OAuthBase(Extractor): def recv(self): """Open local HTTP server and recv callback parameters""" import socket - print("Waiting for response. (Cancel with Ctrl+c)") + stdout_write("Waiting for response. 
(Cancel with Ctrl+c)\n") server = socket.socket(socket.AF_INET, socket.SOCK_STREAM) server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) server.bind(("localhost", self.config("port", 6414))) @@ -60,7 +61,7 @@ class OAuthBase(Extractor): def send(self, msg): """Send 'msg' to the socket opened in 'recv()'""" - print(msg) + stdout_write(msg) self.client.send(b"HTTP/1.1 200 OK\r\n\r\n" + msg.encode()) self.client.close() @@ -69,12 +70,13 @@ class OAuthBase(Extractor): import webbrowser url += "?" + urllib.parse.urlencode(params) if not self.config("browser", True) or not webbrowser.open(url): - print("Please open this URL in your browser:") - print(url, end="\n\n", flush=True) + stdout_write( + "Please open this URL in your browser:\n\n" + url + "\n\n") return (recv or self.recv)() def error(self, msg): - return self.send("Remote server reported an error:\n\n" + str(msg)) + return self.send( + "Remote server reported an error:\n\n{}\n".format(msg)) def _oauth1_authorization_flow( self, request_token_url, authorize_url, access_token_url): @@ -133,7 +135,7 @@ class OAuthBase(Extractor): # check authorization response if state != params.get("state"): - self.send("'state' mismatch: expected {}, got {}.".format( + self.send("'state' mismatch: expected {}, got {}.\n".format( state, params.get("state") )) return @@ -188,7 +190,7 @@ class OAuthBase(Extractor): opt = self.oauth_config(names[0]) if self.cache and (opt is None or opt == "cache"): - msg += _vh + " been cached and will automatically be used." + msg += _vh + " been cached and will automatically be used.\n" else: msg += "Put " + _va + " into your configuration file as \n" msg += " and\n".join( @@ -200,7 +202,7 @@ class OAuthBase(Extractor): "\nor set\n'extractor.{}.{}' to \"cache\"" .format(self.subcategory, names[0]) ) - msg += "\nto use {}.".format(_it) + msg += "\nto use {}.\n".format(_it) return msg @@ -398,9 +400,9 @@ class OAuthPixiv(OAuthBase): data = self.session.post(url, headers=headers, data=data).json() if "error" in data: - print(data) + stdout_write("\n{}\n".format(data)) if data["error"] in ("invalid_request", "invalid_grant"): - print("'code' expired, try again") + stdout_write("'code' expired, try again\n\n") return token = data["refresh_token"] @@ -409,10 +411,10 @@ class OAuthPixiv(OAuthBase): pixiv._refresh_token_cache.update(username, token) self.log.info("Writing 'refresh-token' to cache") - print(self._generate_message(("refresh-token",), (token,))) + stdout_write(self._generate_message(("refresh-token",), (token,))) def _input(self): - print(""" + stdout_write("""\ 1) Open your browser's Developer Tools (F12) and switch to the Network tab 2) Login 3) Select the last network monitor entry ('callback?state=...') @@ -421,6 +423,7 @@ class OAuthPixiv(OAuthBase): - This 'code' will expire 30 seconds after logging in. - Copy-pasting more than just the 'code' value will work as well, like the entire URL or several query parameters. 
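The "state" comparison above is the standard CSRF guard of an OAuth2 authorization-code flow: a locally generated nonce is sent along with the authorize URL and must come back unchanged in the redirect. A minimal sketch; params stands in for the parsed callback query string:

import secrets

state = secrets.token_urlsafe(16)         # sent with the authorize URL
params = {"code": "...", "state": state}  # returned by the provider

if params.get("state") != state:
    raise RuntimeError("'state' mismatch: possible CSRF, aborting")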
+ """) code = input("code: ") return code.rpartition("=")[2].strip() diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py index 92b8113..951b34d 100644 --- a/gallery_dl/extractor/philomena.py +++ b/gallery_dl/extractor/philomena.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2021 Mike Fährmann +# Copyright 2021-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -58,12 +58,21 @@ class PhilomenaExtractor(BooruExtractor): INSTANCES = { - "derpibooru": {"root": "https://derpibooru.org", - "filter_id": "56027"}, - "ponybooru" : {"root": "https://ponybooru.org", - "filter_id": "2"}, - "furbooru" : {"root": "https://furbooru.org", - "filter_id": "2"}, + "derpibooru": { + "root": "https://derpibooru.org", + "pattern": r"derpibooru\.org", + "filter_id": "56027", + }, + "ponybooru": { + "root": "https://ponybooru.org", + "pattern": r"ponybooru\.org", + "filter_id": "2", + }, + "furbooru": { + "root": "https://furbooru.org", + "pattern": r"furbooru\.org", + "filter_id": "2", + }, } BASE_PATTERN = PhilomenaExtractor.update(INSTANCES) @@ -239,5 +248,5 @@ class PhilomenaGalleryExtractor(PhilomenaExtractor): def posts(self): gallery_id = "gallery_id:" + self.gallery_id url = self.root + "/api/v1/json/search/images" - params = {"sd": "desc", "sf": gallery_id, "q" : gallery_id} + params = {"sd": "desc", "sf": gallery_id, "q": gallery_id} return self._pagination(url, params) diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index a33df42..9b35e42 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2021 Mike Fährmann +# Copyright 2014-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -10,16 +10,16 @@ from .common import Extractor, Message from .. 
import text, util, exception -from ..cache import cache +from ..cache import cache, memcache from datetime import datetime, timedelta import itertools import hashlib -import time class PixivExtractor(Extractor): """Base class for pixiv extractors""" category = "pixiv" + root = "https://www.pixiv.net" directory_fmt = ("{category}", "{user[id]} {user[account]}") filename_fmt = "{id}_p{num}.{extension}" archive_fmt = "{id}{suffix}.{extension}" @@ -73,7 +73,14 @@ class PixivExtractor(Extractor): if work["type"] == "ugoira": if not self.load_ugoira: continue - ugoira = self.api.ugoira_metadata(work["id"]) + + try: + ugoira = self.api.ugoira_metadata(work["id"]) + except exception.StopExtraction as exc: + self.log.warning( + "Unable to retrieve Ugoira metadata (%s - %s)", + work.get("id"), exc.message) + continue url = ugoira["zip_urls"]["medium"].replace( "_ugoira600x600", "_ugoira1920x1080") @@ -91,22 +98,70 @@ class PixivExtractor(Extractor): work["suffix"] = "_p{:02}".format(work["num"]) yield Message.Url, url, text.nameext_from_url(url, work) + @staticmethod + def _make_work(kind, url, user): + p = url.split("/") + return { + "create_date" : "{}-{}-{}T{}:{}:{}+09:00".format( + p[5], p[6], p[7], p[8], p[9], p[10]) if len(p) > 9 else None, + "height" : 0, + "id" : kind, + "image_urls" : None, + "meta_pages" : (), + "meta_single_page": {"original_image_url": url}, + "page_count" : 1, + "sanity_level" : 0, + "tags" : (), + "title" : kind, + "type" : kind, + "user" : user, + "width" : 0, + "x_restrict" : 0, + } + def works(self): - """Return an iterable containing all relevant 'work'-objects""" + """Return an iterable containing all relevant 'work' objects""" def metadata(self): - """Collect metadata for extractor-job""" + """Collect metadata for extractor job""" return {} class PixivUserExtractor(PixivExtractor): - """Extractor for works of a pixiv user""" + """Extractor for a pixiv user profile""" subcategory = "user" pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/(?:" - r"(?:en/)?users/(\d+)(?:/(?:artworks|illustrations|manga)" - r"(?:/([^/?#]+))?)?/?(?:$|[?#])" - r"|member(?:_illust)?\.php\?id=(\d+)(?:&([^#]+))?" 
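_make_work above now derives create_date from the image URL itself: an i.pximg.net path encodes year/month/day/hour/minute/second (JST) as consecutive segments. The same index arithmetic on an illustrative URL:

url = ("https://i.pximg.net/img-original"
       "/img/2021/01/30/16/12/02/12345678_p0.png")
p = url.split("/")
if len(p) > 9:
    create_date = "{}-{}-{}T{}:{}:{}+09:00".format(
        p[5], p[6], p[7], p[8], p[9], p[10])
else:
    create_date = None
print(create_date)  # -> 2021-01-30T16:12:02+09:00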
- r"|(?:u(?:ser)?/|(?:mypage\.php)?#id=)(\d+))") + r"(?:en/)?u(?:sers)?/|member\.php\?id=|(?:mypage\.php)?#id=" + r")(\d+)(?:$|[?#])") + test = ( + ("https://www.pixiv.net/en/users/173530"), + ("https://www.pixiv.net/u/173530"), + ("https://www.pixiv.net/member.php?id=173530"), + ("https://www.pixiv.net/mypage.php#id=173530"), + ("https://www.pixiv.net/#id=173530"), + ) + + def __init__(self, match): + PixivExtractor.__init__(self, match) + self.user_id = match.group(1) + + def items(self): + base = "{}/users/{}/".format(self.root, self.user_id) + return self._dispatch_extractors(( + (PixivAvatarExtractor , base + "avatar"), + (PixivBackgroundExtractor, base + "background"), + (PixivArtworksExtractor , base + "artworks"), + (PixivFavoriteExtractor , base + "bookmarks/artworks"), + ), ("artworks",)) + + +class PixivArtworksExtractor(PixivExtractor): + """Extractor for artworks of a pixiv user""" + subcategory = "artworks" + pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/(?:" + r"(?:en/)?users/(\d+)/(?:artworks|illustrations|manga)" + r"(?:/([^/?#]+))?/?(?:$|[?#])" + r"|member_illust\.php\?id=(\d+)(?:&([^#]+))?)") test = ( ("https://www.pixiv.net/en/users/173530/artworks", { "url": "852c31ad83b6840bacbce824d85f2a997889efb7", @@ -120,47 +175,30 @@ class PixivUserExtractor(PixivExtractor): "&tag=%E6%89%8B%E3%81%B6%E3%82%8D"), { "url": "25b1cd81153a8ff82eec440dd9f20a4a22079658", }), - # avatar (#595, #623, #1124) - ("https://www.pixiv.net/en/users/173530", { - "options": (("avatar", True),), - "content": "4e57544480cc2036ea9608103e8f024fa737fe66", - "range": "1", - }), - # background (#623, #1124, #2495) - ("https://www.pixiv.net/en/users/194921", { - "options": (("background", True),), - "content": "aeda3536003ea3002f70657cb93c5053f26f5843", - "range": "1", - }), # deleted account ("http://www.pixiv.net/member_illust.php?id=173531", { "options": (("metadata", True),), "exception": exception.NotFoundError, }), - ("https://www.pixiv.net/en/users/173530"), ("https://www.pixiv.net/en/users/173530/manga"), ("https://www.pixiv.net/en/users/173530/illustrations"), ("https://www.pixiv.net/member_illust.php?id=173530"), - ("https://www.pixiv.net/u/173530"), - ("https://www.pixiv.net/user/173530"), - ("https://www.pixiv.net/mypage.php#id=173530"), - ("https://www.pixiv.net/#id=173530"), ("https://touch.pixiv.net/member_illust.php?id=173530"), ) def __init__(self, match): PixivExtractor.__init__(self, match) - u1, t1, u2, t2, u3 = match.groups() + u1, t1, u2, t2 = match.groups() if t1: t1 = text.unquote(t1) elif t2: t2 = text.parse_query(t2).get("tag") - self.user_id = u1 or u2 or u3 + self.user_id = u1 or u2 self.tag = t1 or t2 def metadata(self): if self.config("metadata"): - return {"user": self.api.user_detail(self.user_id)["user"]} + return self.api.user_detail(self.user_id) return {} def works(self): @@ -173,54 +211,60 @@ class PixivUserExtractor(PixivExtractor): if tag in [t["name"].lower() for t in work["tags"]] ) - avatar = self.config("avatar") - background = self.config("background") - if avatar or background: - work_list = [] - detail = self.api.user_detail(self.user_id) - user = detail["user"] - - if avatar: - url = user["profile_image_urls"]["medium"] - work_list.append((self._make_work( - "avatar", url.replace("_170.", "."), user),)) - - if background: - url = detail["profile"]["background_image_url"] - if url: - if "/c/" in url: - parts = url.split("/") - del parts[3:5] - url = "/".join(parts) - url = url.replace("_master1200.", ".") - work = self._make_work("background", url, 
user) - if url.endswith(".jpg"): - work["_fallback"] = (url[:-4] + ".png",) - work_list.append((work,)) - - work_list.append(works) - works = itertools.chain.from_iterable(work_list) - return works - @staticmethod - def _make_work(kind, url, user): - return { - "create_date" : None, - "height" : 0, - "id" : kind, - "image_urls" : None, - "meta_pages" : (), - "meta_single_page": {"original_image_url": url}, - "page_count" : 1, - "sanity_level" : 0, - "tags" : (), - "title" : kind, - "type" : kind, - "user" : user, - "width" : 0, - "x_restrict" : 0, - } + +class PixivAvatarExtractor(PixivExtractor): + """Extractor for pixiv avatars""" + subcategory = "avatar" + filename_fmt = "avatar{date:?_//%Y-%m-%d}.{extension}" + archive_fmt = "avatar_{user[id]}_{date}" + pattern = (r"(?:https?://)?(?:www\.)?pixiv\.net" + r"/(?:en/)?users/(\d+)/avatar") + test = ("https://www.pixiv.net/en/users/173530/avatar", { + "content": "4e57544480cc2036ea9608103e8f024fa737fe66", + }) + + def __init__(self, match): + PixivExtractor.__init__(self, match) + self.user_id = match.group(1) + + def works(self): + user = self.api.user_detail(self.user_id)["user"] + url = user["profile_image_urls"]["medium"].replace("_170.", ".") + return (self._make_work("avatar", url, user),) + + +class PixivBackgroundExtractor(PixivExtractor): + """Extractor for pixiv background banners""" + subcategory = "background" + filename_fmt = "background{date:?_//%Y-%m-%d}.{extension}" + archive_fmt = "background_{user[id]}_{date}" + pattern = (r"(?:https?://)?(?:www\.)?pixiv\.net" + r"/(?:en/)?users/(\d+)/background") + test = ("https://www.pixiv.net/en/users/194921/background", { + "pattern": r"https://i\.pximg\.net/background/img/2021/01/30/16/12/02" + r"/194921_af1f71e557a42f499213d4b9eaccc0f8\.jpg", + }) + + def __init__(self, match): + PixivExtractor.__init__(self, match) + self.user_id = match.group(1) + + def works(self): + detail = self.api.user_detail(self.user_id) + url = detail["profile"]["background_image_url"] + if not url: + return () + if "/c/" in url: + parts = url.split("/") + del parts[3:5] + url = "/".join(parts) + url = url.replace("_master1200.", ".") + work = self._make_work("background", url, detail["user"]) + if url.endswith(".jpg"): + work["_fallback"] = (url[:-4] + ".png",) + return (work,) class PixivMeExtractor(PixivExtractor): @@ -312,10 +356,10 @@ class PixivFavoriteExtractor(PixivExtractor): r"|bookmark\.php)(?:\?([^#]*))?") test = ( ("https://www.pixiv.net/en/users/173530/bookmarks/artworks", { - "url": "e717eb511500f2fa3497aaee796a468ecf685cc4", + "url": "85a3104eaaaf003c7b3947117ca2f1f0b1cfc949", }), ("https://www.pixiv.net/bookmark.php?id=173530", { - "url": "e717eb511500f2fa3497aaee796a468ecf685cc4", + "url": "85a3104eaaaf003c7b3947117ca2f1f0b1cfc949", }), # bookmarks with specific tag (("https://www.pixiv.net/en/users/3137110" @@ -735,66 +779,70 @@ class PixivAppAPI(): def illust_detail(self, illust_id): params = {"illust_id": illust_id} - return self._call("v1/illust/detail", params)["illust"] + return self._call("/v1/illust/detail", params)["illust"] def illust_follow(self, restrict="all"): params = {"restrict": restrict} - return self._pagination("v2/illust/follow", params) + return self._pagination("/v2/illust/follow", params) def illust_ranking(self, mode="day", date=None): params = {"mode": mode, "date": date} - return self._pagination("v1/illust/ranking", params) + return self._pagination("/v1/illust/ranking", params) def illust_related(self, illust_id): params = {"illust_id": illust_id} - return 
self._pagination("v2/illust/related", params) + return self._pagination("/v2/illust/related", params) def search_illust(self, word, sort=None, target=None, duration=None, date_start=None, date_end=None): params = {"word": word, "search_target": target, "sort": sort, "duration": duration, "start_date": date_start, "end_date": date_end} - return self._pagination("v1/search/illust", params) + return self._pagination("/v1/search/illust", params) def user_bookmarks_illust(self, user_id, tag=None, restrict="public"): params = {"user_id": user_id, "tag": tag, "restrict": restrict} - return self._pagination("v1/user/bookmarks/illust", params) + return self._pagination("/v1/user/bookmarks/illust", params) + @memcache(keyarg=1) def user_detail(self, user_id): params = {"user_id": user_id} - return self._call("v1/user/detail", params) + return self._call("/v1/user/detail", params) def user_following(self, user_id, restrict="public"): params = {"user_id": user_id, "restrict": restrict} - return self._pagination("v1/user/following", params, "user_previews") + return self._pagination("/v1/user/following", params, "user_previews") def user_illusts(self, user_id): params = {"user_id": user_id} - return self._pagination("v1/user/illusts", params) + return self._pagination("/v1/user/illusts", params) def ugoira_metadata(self, illust_id): params = {"illust_id": illust_id} - return self._call("v1/ugoira/metadata", params)["ugoira_metadata"] + return self._call("/v1/ugoira/metadata", params)["ugoira_metadata"] def _call(self, endpoint, params=None): - url = "https://app-api.pixiv.net/" + endpoint + url = "https://app-api.pixiv.net" + endpoint + + while True: + self.login() + response = self.extractor.request(url, params=params, fatal=False) + data = response.json() - self.login() - response = self.extractor.request(url, params=params, fatal=False) - data = response.json() + if "error" not in data: + return data + + self.log.debug(data) - if "error" in data: if response.status_code == 404: raise exception.NotFoundError() error = data["error"] if "rate limit" in (error.get("message") or "").lower(): - self.log.info("Waiting two minutes for API rate limit reset.") - time.sleep(120) - return self._call(endpoint, params) - raise exception.StopExtraction("API request failed: %s", error) + self.extractor.wait(seconds=300) + continue - return data + raise exception.StopExtraction("API request failed: %s", error) def _pagination(self, endpoint, params, key="illusts"): while True: diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py index b3a620a..db8d700 100644 --- a/gallery_dl/extractor/reactor.py +++ b/gallery_dl/extractor/reactor.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2021 Mike Fährmann +# Copyright 2019-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -160,6 +160,7 @@ BASE_PATTERN = ReactorExtractor.update({ }, "thatpervert": { "root": "http://thatpervert.com", + "pattern": r"thatpervert\.com", }, }) diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py index c8b8c9a..16b9191 100644 --- a/gallery_dl/extractor/readcomiconline.py +++ b/gallery_dl/extractor/readcomiconline.py @@ -24,6 +24,7 @@ class ReadcomiconlineBase(): archive_fmt = "{issue_id}_{page}" root = "https://readcomiconline.li" browser = "firefox" + request_interval = (1, 9) def request(self, url, **kwargs): """Detect and handle redirects to CAPTCHA 
pages""" @@ -85,7 +86,7 @@ class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor): return [ (beau(url), None) for url in text.extract_iter( - page, 'lstImages.push("', '"' + page, "lstImages.push('", "'", ) ] @@ -129,10 +130,13 @@ class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor): def beau(url): - """https://readcomiconline.li/Scripts/rguard.min.js?v=1.1""" + """https://readcomiconline.li/Scripts/rguard.min.js""" if url.startswith("https"): return url + url = url.replace("_x236", "d") + url = url.replace("_x945", "g") + containsS0 = "=s0" in url url = url[:-3 if containsS0 else -6] url = url[4:22] + url[25:] diff --git a/gallery_dl/extractor/shopify.py b/gallery_dl/extractor/shopify.py index f276e84..f2bf3cb 100644 --- a/gallery_dl/extractor/shopify.py +++ b/gallery_dl/extractor/shopify.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2021 Mike Fährmann +# Copyright 2019-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -43,19 +43,45 @@ class ShopifyExtractor(BaseExtractor): BASE_PATTERN = ShopifyExtractor.update({ + "chelseacrew": { + "root": "https://chelseacrew.com", + "pattern": r"(?:www\.)?chelseacrew\.com", + }, "fashionnova": { "root": "https://www.fashionnova.com", "pattern": r"(?:www\.)?fashionnova\.com", }, + "loungeunderwear": { + "root": "https://loungeunderwear.com", + "pattern": r"(?:[a-z]+\.)?loungeunderwear\.com", + }, + "michaelscameras": { + "root": "https://michaels.com.au", + "pattern": r"michaels\.com\.au", + }, + "modcloth": { + "root": "https://modcloth.com", + "pattern": r"modcloth\.com", + }, "omgmiamiswimwear": { "root": "https://www.omgmiamiswimwear.com", + "pattern": r"(?:www\.)?omgmiamiswimwear\.com", + }, + "pinupgirlclothing": { + "root": "https://pinupgirlclothing.com", + "pattern": r"pinupgirlclothing\.com", + }, + "raidlondon": { + "root": "https://www.raidlondon.com", + "pattern": r"(?:www\.)?raidlondon\.com", + }, + "unique-vintage": { + "root": "https://www.unique-vintage.com", + "pattern": r"(?:www\.)?unique\-vintage\.com", }, "windsorstore": { "root": "https://www.windsorstore.com", - }, - "loungeunderwear": { - "root": "https://loungeunderwear.com", - "pattern": r"(?:[a-z]+\.)?loungeunderwear\.com", + "pattern": r"(?:www\.)?windsorstore\.com", }, }) @@ -66,15 +92,21 @@ class ShopifyCollectionExtractor(ShopifyExtractor): directory_fmt = ("{category}", "{collection[title]}") pattern = BASE_PATTERN + r"(/collections/[\w-]+)/?(?:$|[?#])" test = ( + ("https://chelseacrew.com/collections/flats"), ("https://www.fashionnova.com/collections/mini-dresses", { "range": "1-20", "count": 20, }), ("https://www.fashionnova.com/collections/mini-dresses/?page=1"), ("https://www.fashionnova.com/collections/mini-dresses#1"), + ("https://loungeunderwear.com/collections/apparel"), + ("https://michaels.com.au/collections/microphones"), + ("https://modcloth.com/collections/shoes"), ("https://www.omgmiamiswimwear.com/collections/fajas"), + ("https://pinupgirlclothing.com/collections/evening"), + ("https://www.raidlondon.com/collections/flats"), + ("https://www.unique-vintage.com/collections/flapper-1920s"), ("https://www.windsorstore.com/collections/dresses-ball-gowns"), - ("https://loungeunderwear.com/collections/apparel"), ) def metadata(self): @@ -99,18 +131,28 @@ class ShopifyProductExtractor(ShopifyExtractor): directory_fmt = ("{category}", "Products") pattern = BASE_PATTERN + 
r"((?:/collections/[\w-]+)?/products/[\w-]+)" test = ( + ("https://chelseacrew.com/collections/flats/products/dora"), ("https://www.fashionnova.com/products/essential-slide-red", { "pattern": r"https?://cdn\d*\.shopify.com/", "count": 3, }), + ("https://www.fashionnova.com/collections/flats/products/name"), + ("https://de.loungeunderwear.com/products/ribbed-crop-top-black"), + ("https://michaels.com.au/collections/audio/products" + "/boya-by-wm4-pro-k5-2-4ghz-mic-android-1-1-101281"), + ("https://modcloth.com/collections/shoes/products/heidii-brn"), ("https://www.omgmiamiswimwear.com/products/la-medusa-maxi-dress", { "pattern": r"https://cdn\.shopify\.com/s/files/1/1819/6171/", "count": 5, }), - ("https://www.fashionnova.com/collections/flats/products/name"), + ("https://pinupgirlclothing.com/collections/evening/products" + "/clarice-coat-dress-in-olive-green-poly-crepe-laura-byrnes-design"), + ("https://www.raidlondon.com/collections/flats/products" + "/raid-addyson-chunky-flat-shoe-in-white"), + ("https://www.unique-vintage.com/collections/flapper-1920s/products" + "/unique-vintage-plus-size-black-silver-beaded-troyes-flapper-dress"), ("https://www.windsorstore.com/collections/accessories-belts/products" "/rhine-buckle-dbl-o-ring-pu-strap-belt-073010158001"), - ("https://de.loungeunderwear.com/products/ribbed-crop-top-black"), ) def products(self): diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 4c947e7..2737d34 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -15,7 +15,7 @@ import json BASE_PATTERN = ( r"(?:https?://)?(?:www\.|mobile\.)?" - r"(?:(?:fx)?twitter\.com|nitter\.net)" + r"(?:(?:[fv]x)?twitter\.com|nitter\.net)" ) @@ -39,7 +39,7 @@ class TwitterExtractor(Extractor): self.pinned = self.config("pinned", False) self.quoted = self.config("quoted", False) self.videos = self.config("videos", True) - self.cards = self.config("cards", True) + self.cards = self.config("cards", False) self._user_cache = {} self._init_sizes() @@ -104,6 +104,7 @@ class TwitterExtractor(Extractor): def _extract_media(self, tweet, entities, files): for media in entities: + descr = media.get("ext_alt_text") width = media["original_info"].get("width", 0) height = media["original_info"].get("height", 0) @@ -112,9 +113,10 @@ class TwitterExtractor(Extractor): files.append({ "url": "ytdl:{}/i/web/status/{}".format( self.root, tweet["id_str"]), - "width" : width, - "height" : height, - "extension": None, + "width" : width, + "height" : height, + "extension" : None, + "description": descr, }) elif self.videos: video_info = media["video_info"] @@ -123,22 +125,24 @@ class TwitterExtractor(Extractor): key=lambda v: v.get("bitrate", 0), ) files.append({ - "url" : variant["url"], - "width" : width, - "height" : height, - "bitrate" : variant.get("bitrate", 0), - "duration": video_info.get( + "url" : variant["url"], + "width" : width, + "height" : height, + "bitrate" : variant.get("bitrate", 0), + "duration" : video_info.get( "duration_millis", 0) / 1000, + "description": descr, }) elif "media_url_https" in media: url = media["media_url_https"] base, _, fmt = url.rpartition(".") base += "?format=" + fmt + "&name=" files.append(text.nameext_from_url(url, { - "url" : base + self._size_image, - "width" : width, - "height" : height, - "_fallback": self._image_fallback(base), + "url" : base + self._size_image, + "width" : width, + "height" : height, + "_fallback" : self._image_fallback(base), + "description": descr, })) else: files.append({"url": 
media["media_url"]}) @@ -323,6 +327,9 @@ class TwitterExtractor(Extractor): elif userfmt == "media": cls = TwitterMediaExtractor fmt = (self.root + "/id:{rest_id}/media").format_map + elif userfmt == "tweets": + cls = TwitterTweetsExtractor + fmt = (self.root + "/id:{rest_id}/tweets").format_map else: cls = None fmt = userfmt.format_map @@ -383,7 +390,7 @@ class TwitterExtractor(Extractor): class TwitterTimelineExtractor(TwitterExtractor): - """Extractor for Tweets from a user's timeline""" + """Extractor for a Twitter user timeline""" subcategory = "timeline" pattern = (BASE_PATTERN + r"/(?!search)(?:([^/?#]+)/?(?:$|[?#])" r"|i(?:/user/|ntent/user\?user_id=)(\d+))") @@ -400,6 +407,8 @@ class TwitterTimelineExtractor(TwitterExtractor): ("https://www.twitter.com/id:2976459548"), ("https://twitter.com/i/user/2976459548"), ("https://twitter.com/intent/user?user_id=2976459548"), + ("https://fxtwitter.com/supernaturepics"), + ("https://vxtwitter.com/supernaturepics"), ) def __init__(self, match): @@ -409,6 +418,52 @@ class TwitterTimelineExtractor(TwitterExtractor): self.user = "id:" + user_id def tweets(self): + tweets = (self.api.user_tweets(self.user) if self.retweets else + self.api.user_media(self.user)) + + # yield initial batch of (media) tweets + tweet = None + for tweet in tweets: + yield tweet + + if tweet is None: + return + + # get username + if not self.user.startswith("id:"): + username = self.user + elif "core" in tweet: + username = (tweet["core"]["user_results"]["result"] + ["legacy"]["screen_name"]) + else: + username = tweet["user"]["screen_name"] + + # get tweet data + if "legacy" in tweet: + tweet = tweet["legacy"] + + # yield search results starting from last tweet id + yield from self.api.search_adaptive( + "from:{} include:retweets include:nativeretweets max_id:{} " + "filter:images OR card_name:animated_gif OR filter:native_video" + .format(username, tweet["id_str"]) + ) + + +class TwitterTweetsExtractor(TwitterExtractor): + """Extractor for Tweets from a user's Tweets timeline""" + subcategory = "tweets" + pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/tweets(?!\w)" + test = ( + ("https://twitter.com/supernaturepics/tweets", { + "range": "1-40", + "url": "c570ac1aae38ed1463be726cc46f31cac3d82a40", + }), + ("https://mobile.twitter.com/supernaturepics/tweets#t"), + ("https://www.twitter.com/id:2976459548/tweets"), + ) + + def tweets(self): return self.api.user_tweets(self.user) @@ -662,6 +717,10 @@ class TwitterTweetExtractor(TwitterExtractor): "options": (("syndication", True),), "count": 1, }), + # media alt texts / descriptions (#2617) + ("https://twitter.com/my0nruri/status/1528379296041299968", { + "keyword": {"description": "oc"} + }), ) def __init__(self, match): diff --git a/gallery_dl/extractor/vk.py b/gallery_dl/extractor/vk.py index 8fb9bbf..23f6ea2 100644 --- a/gallery_dl/extractor/vk.py +++ b/gallery_dl/extractor/vk.py @@ -9,7 +9,7 @@ """Extractors for https://vk.com/""" from .common import Extractor, Message -from .. import text +from .. 
import text, exception BASE_PATTERN = r"(?:https://)?(?:www\.|m\.)?vk\.com" @@ -39,9 +39,15 @@ class VkExtractor(Extractor): self.log.warning("no photo URL found (%s)", photo.get("id")) continue - photo.update(data) - photo["url"], photo["width"], photo["height"] = photo[size] + try: + photo["url"], photo["width"], photo["height"] = photo[size] + except ValueError: + # photo without width/height entries (#2535) + photo["url"] = photo[size + "src"] + photo["width"] = photo["height"] = 0 + photo["id"] = photo["id"].rpartition("_")[2] + photo.update(data) text.nameext_from_url(photo["url"], photo) yield Message.Url, photo["url"], photo @@ -66,6 +72,10 @@ class VkExtractor(Extractor): url, method="POST", headers=headers, data=data, ).json()["payload"][1] + if len(payload) < 4: + self.log.debug(payload) + raise exception.AuthorizationError(payload[0]) + total = payload[1] photos = payload[3] @@ -105,7 +115,7 @@ class VkPhotosExtractor(VkExtractor): }, }), ("https://vk.com/cosplayinrussia", { - "range": "25-35", + "range": "15-25", "keywords": { "id": r"re:\d+", "user": { @@ -117,6 +127,12 @@ class VkPhotosExtractor(VkExtractor): }, }, }), + # photos without width/height (#2535) + ("https://vk.com/id76957806", { + "pattern": r"https://sun\d+-\d+\.userapi\.com/", + "range": "1-9", + "count": 9, + }), ("https://m.vk.com/albums398982326"), ("https://www.vk.com/id398982326?profile=1"), ("https://vk.com/albums-165740836"), @@ -150,7 +166,8 @@ class VkPhotosExtractor(VkExtractor): '<h1 class="page_name">', "<")).replace(" ", " "), "info": text.unescape(text.remove_html(extr( '<span class="current_text">', '</span'))), - "id" : extr('<a href="/albums', '"'), + "id" : (extr('<a href="/albums', '"') or + extr('data-from-id="', '"')), }} @@ -166,6 +183,10 @@ class VkAlbumExtractor(VkExtractor): ("https://vk.com/album-165740836_281339889", { "count": 12, }), + # "Access denied" (#2556) + ("https://vk.com/album-53775183_00", { + "exception": exception.AuthorizationError, + }), ) def __init__(self, match): diff --git a/gallery_dl/extractor/weasyl.py b/gallery_dl/extractor/weasyl.py index 75b78c5..599a175 100644 --- a/gallery_dl/extractor/weasyl.py +++ b/gallery_dl/extractor/weasyl.py @@ -47,6 +47,7 @@ class WeasylExtractor(Extractor): return data def submissions(self, owner_login, folderid=None): + metadata = self.config("metadata") url = "{}/api/users/{}/gallery".format(self.root, owner_login) params = { "nextid" : None, @@ -56,6 +57,9 @@ class WeasylExtractor(Extractor): while True: data = self.request(url, params=params).json() for submission in data["submissions"]: + if metadata: + submission = self.request_submission( + submission["submitid"]) if self.populate_submission(submission): submission["folderid"] = folderid # Do any submissions have more than one url? 
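The vk fix above handles photos that carry no width/height: photo[size] is then not a 3-tuple, the unpack raises ValueError, and the code falls back to the bare "<size>src" URL with zeroed dimensions (#2535). The same guard on sample data (hosts illustrative):

def photo_url(photo, size="x"):
    try:
        url, width, height = photo[size]
    except ValueError:  # photo without width/height entries (#2535)
        url = photo[size + "src"]
        width = height = 0
    return url, width, height

print(photo_url({"x": ("https://sun9-1.example.com/a.jpg", 604, 403)}))
print(photo_url({"x": ("https://sun9-1.example.com/a.jpg",),
                 "xsrc": "https://sun9-1.example.com/a.jpg"}))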
If so diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py index cf5b192..59f46f0 100644 --- a/gallery_dl/extractor/webtoons.py +++ b/gallery_dl/extractor/webtoons.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- # Copyright 2020 Leonardo Taccari +# Copyright 2021-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -41,8 +42,8 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor): """Extractor for an episode on webtoons.com""" subcategory = "episode" directory_fmt = ("{category}", "{comic}") - filename_fmt = "{episode}-{num:>02}.{extension}" - archive_fmt = "{title_no}_{episode}_{num}" + filename_fmt = "{episode_no}-{num:>02}.{extension}" + archive_fmt = "{title_no}_{episode_no}_{num}" pattern = (BASE_PATTERN + r"/([^/?#]+)/([^/?#]+)/(?:[^/?#]+))" r"/viewer(?:\?([^#'\"]+))") test = ( @@ -54,6 +55,18 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor): "49e1f2def04c6f7a6a3dacf245a1cd9abe77a6a9"), "count": 5, }), + (("https://www.webtoons.com/en/challenge/punderworld" + "/happy-earth-day-/viewer?title_no=312584&episode_no=40"), { + "keyword": { + "comic": "punderworld", + "description": str, + "episode": "36", + "episode_no": "40", + "genre": "challenge", + "title": r"re:^Punderworld - .+", + "title_no": "312584", + }, + }), ) def __init__(self, match): @@ -65,11 +78,13 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor): query = text.parse_query(query) self.title_no = query.get("title_no") - self.episode = query.get("episode_no") + self.episode_no = query.get("episode_no") def metadata(self, page): + keywords, pos = text.extract( + page, '<meta name="keywords" content="', '"') title, pos = text.extract( - page, '<meta property="og:title" content="', '"') + page, '<meta property="og:title" content="', '"', pos) descr, pos = text.extract( page, '<meta property="og:description" content="', '"', pos) @@ -77,8 +92,9 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor): "genre" : self.genre, "comic" : self.comic, "title_no" : self.title_no, - "episode" : self.episode, + "episode_no" : self.episode_no, "title" : text.unescape(title), + "episode" : keywords.split(", ")[1], "description": text.unescape(descr), "lang" : self.lang, "language" : util.code_to_language(self.lang), diff --git a/gallery_dl/formatter.py b/gallery_dl/formatter.py index 27d5e40..d1b3a8a 100644 --- a/gallery_dl/formatter.py +++ b/gallery_dl/formatter.py @@ -20,6 +20,7 @@ _CACHE = {} _CONVERSIONS = None _GLOBALS = { "_env": lambda: os.environ, + "_lit": lambda: _literal, "_now": datetime.datetime.now, } @@ -219,6 +220,10 @@ def parse_field_name(field_name): first, rest = _string.formatter_field_name_split(field_name) funcs = [] + if first[0] == "'": + funcs.append(operator.itemgetter(first[1:-1])) + first = "_lit" + for is_attr, key in rest: if is_attr: func = operator.attrgetter @@ -344,3 +349,15 @@ def _default_format(format_spec): def wrap(obj): return format(obj, format_spec) return wrap + + +class Literal(): + # __getattr__, __getattribute__, and __class_getitem__ + # are all slower than regular __getitem__ + + @staticmethod + def __getitem__(key): + return key + + +_literal = Literal() diff --git a/gallery_dl/job.py b/gallery_dl/job.py index 044369a..a0adffb 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -16,6 +16,7 @@ import collections from . import extractor, downloader, postprocessor from . 
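The formatter change above introduces quoted literal fields: a replacement field like {'foo'} resolves to the string foo itself. It works by rewriting the field to the _lit global plus an itemgetter over a Literal object whose static __getitem__ simply echoes its key. A reduced model of that mechanism:

import operator

class Literal():
    # static __getitem__ is faster here than __getattr__ variants
    @staticmethod
    def __getitem__(key):
        return key

_literal = Literal()

field_name = "'hello'"                          # quoted field, as parsed
getter = operator.itemgetter(field_name[1:-1])  # strip the quotes
print(getter(_literal))                         # -> hello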
diff --git a/gallery_dl/job.py b/gallery_dl/job.py
index 044369a..a0adffb 100644
--- a/gallery_dl/job.py
+++ b/gallery_dl/job.py
@@ -16,6 +16,7 @@ import collections
 from . import extractor, downloader, postprocessor
 from . import config, text, util, path, formatter, output, exception
 from .extractor.message import Message
+from .output import stdout_write
 
 
 class Job():
@@ -264,7 +265,7 @@ class DownloadJob(Job):
 
         # download succeeded
         pathfmt.finalize()
-        self.out.success(pathfmt.path, 0)
+        self.out.success(pathfmt.path)
         self._skipcnt = 0
         if archive:
             archive.add(kwdict)
@@ -537,14 +538,14 @@ class KeywordJob(Job):
         self.private = config.get(("output",), "private")
 
     def handle_url(self, url, kwdict):
-        print("\nKeywords for filenames and --filter:")
-        print("------------------------------------")
+        stdout_write("\nKeywords for filenames and --filter:\n"
+                     "------------------------------------\n")
         self.print_kwdict(kwdict)
         raise exception.StopExtraction()
 
     def handle_directory(self, kwdict):
-        print("Keywords for directory names:")
-        print("-----------------------------")
+        stdout_write("Keywords for directory names:\n"
+                     "-----------------------------\n")
         self.print_kwdict(kwdict)
 
     def handle_queue(self, url, kwdict):
@@ -565,36 +566,47 @@ class KeywordJob(Job):
             self.extractor.log.info(
                 "Try 'gallery-dl -K \"%s\"' instead.", url)
         else:
-            print("Keywords for --chapter-filter:")
-            print("------------------------------")
+            stdout_write("Keywords for --chapter-filter:\n"
+                         "------------------------------\n")
             self.print_kwdict(kwdict)
 
         if extr or self.extractor.categorytransfer:
-            print()
+            stdout_write("\n")
             KeywordJob(extr or url, self).run()
 
         raise exception.StopExtraction()
 
-    def print_kwdict(self, kwdict, prefix=""):
+    def print_kwdict(self, kwdict, prefix="", markers=None):
         """Print key-value pairs in 'kwdict' with formatting"""
+        write = sys.stdout.write
         suffix = "]" if prefix else ""
+
+        markerid = id(kwdict)
+        if markers is None:
+            markers = {markerid}
+        elif markerid in markers:
+            write("{}\n  <circular reference>\n".format(prefix[:-1]))
+            return  # ignore circular reference
+        else:
+            markers.add(markerid)
+
         for key, value in sorted(kwdict.items()):
             if key[0] == "_" and not self.private:
                 continue
             key = prefix + key + suffix
 
             if isinstance(value, dict):
-                self.print_kwdict(value, key + "[")
+                self.print_kwdict(value, key + "[", markers)
 
             elif isinstance(value, list):
                 if value and isinstance(value[0], dict):
-                    self.print_kwdict(value[0], key + "[][")
+                    self.print_kwdict(value[0], key + "[][", markers)
                 else:
-                    print(key, "[]", sep="")
+                    write(key + "[]\n")
                     for val in value:
-                        print(" -", val)
+                        write("  - " + str(val) + "\n")
 
             else:
                 # string or number
-                print(key, "\n ", value, sep="")
+                write("{}\n  {}\n".format(key, value))
 
 
 class UrlJob(Job):
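
The markers argument added to print_kwdict guards against kwdicts that (indirectly) contain themselves. The idea, reduced to a standalone sketch:

    def print_dict(d, prefix="", seen=None):
        # track ids of dicts currently being printed; seeing a known
        # id again means we are looping through a circular reference
        if seen is None:
            seen = set()
        if id(d) in seen:
            print(prefix + "<circular reference>")
            return
        seen.add(id(d))
        for key, value in sorted(d.items()):
            if isinstance(value, dict):
                print_dict(value, prefix + key + ".", seen)
            else:
                print(prefix + key, value)
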
@@ -609,14 +621,14 @@
 
     @staticmethod
     def handle_url(url, _):
-        print(url)
+        stdout_write(url + "\n")
 
     @staticmethod
     def handle_url_fallback(url, kwdict):
-        print(url)
+        stdout_write(url + "\n")
         if "_fallback" in kwdict:
             for url in kwdict["_fallback"]:
-                print("|", url)
+                stdout_write("| " + url + "\n")
 
     def handle_queue(self, url, kwdict):
         cls = kwdict.get("_extractor")
@@ -653,15 +665,18 @@ class InfoJob(Job):
         return 0
 
     def _print_multi(self, title, *values):
-        print(title, "\n ", " / ".join(json.dumps(v) for v in values), sep="")
+        stdout_write("{}\n  {}\n\n".format(
+            title, " / ".join(json.dumps(v) for v in values)))
 
     def _print_config(self, title, optname, value):
         optval = self.extractor.config(optname, util.SENTINEL)
         if optval is not util.SENTINEL:
-            print(title, "(custom):\n ", json.dumps(optval))
-            print(title, "(default):\n ", json.dumps(value))
+            stdout_write(
+                "{} (custom):\n  {}\n{} (default):\n  {}\n\n".format(
+                    title, json.dumps(optval), title, json.dumps(value)))
         elif value:
-            print(title, "(default):\n ", json.dumps(value))
+            stdout_write(
+                "{} (default):\n  {}\n\n".format(title, json.dumps(value)))
 
 
 class DataJob(Job):
diff --git a/gallery_dl/option.py b/gallery_dl/option.py
index 782063d..b2a9aa8 100644
--- a/gallery_dl/option.py
+++ b/gallery_dl/option.py
@@ -39,8 +39,9 @@ class AppendCommandAction(argparse.Action):
 class DeprecatedConfigConstAction(argparse.Action):
     """Set argparse const values as config values + deprecation warning"""
     def __call__(self, parser, namespace, values, option_string=None):
-        print("warning: {} is deprecated. Use {} instead.".format(
-            "/".join(self.option_strings), self.choices), file=sys.stderr)
+        sys.stderr.write(
+            "warning: {} is deprecated. Use {} instead.\n".format(
+                "/".join(self.option_strings), self.choices))
         namespace.options.append(((), self.dest, self.const))
@@ -59,7 +60,7 @@ class ParseAction(argparse.Action):
 class Formatter(argparse.HelpFormatter):
     """Custom HelpFormatter class to customize help output"""
     def __init__(self, *args, **kwargs):
-        super().__init__(max_help_position=50, *args, **kwargs)
+        super().__init__(max_help_position=30, *args, **kwargs)
 
     def _format_action_invocation(self, action):
         opts = action.option_strings[:]
@@ -114,11 +115,6 @@ def build_parser():
              "('/O' for \"original\" filenames)"),
     )
     general.add_argument(
-        "--cookies",
-        dest="cookies", metavar="FILE", action=ConfigAction,
-        help="File to load additional cookies from",
-    )
-    general.add_argument(
         "--proxy",
         dest="proxy", metavar="URL", action=ConfigAction,
         help="Use the specified proxy",
@@ -134,6 +130,18 @@ def build_parser():
         help="Delete cached login sessions, cookies, etc. for MODULE "
              "(ALL to delete everything)",
     )
+    general.add_argument(
+        "--cookies",
+        dest="cookies", metavar="FILE", action=ConfigAction,
+        help="File to load additional cookies from",
+    )
+    general.add_argument(
+        "--cookies-from-browser",
+        dest="cookies_from_browser", metavar="BROWSER[+KEYRING][:PROFILE]",
+        help=("Name of the browser to load cookies from, "
+              "with optional keyring name prefixed with '+' and "
+              "profile prefixed with ':'"),
+    )
 
     output = parser.add_argument_group("Output Options")
     output.add_argument(
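
With the new option in place, cookies can be pulled straight from a browser profile instead of an exported cookies.txt. Hypothetical invocations, with KEYRING, PROFILE, and URL as placeholders for real values:

    gallery-dl --cookies-from-browser firefox URL
    gallery-dl --cookies-from-browser chromium+KEYRING:PROFILE URL
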
diff --git a/gallery_dl/output.py b/gallery_dl/output.py
index 7e00e1a..3531304 100644
--- a/gallery_dl/output.py
+++ b/gallery_dl/output.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2015-2021 Mike Fährmann
+# Copyright 2015-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -205,6 +205,30 @@ def setup_logging_handler(key, fmt=LOG_FORMAT, lvl=LOG_LEVEL):
 # --------------------------------------------------------------------
 # Utility functions
 
+def stdout_write_flush(s):
+    sys.stdout.write(s)
+    sys.stdout.flush()
+
+
+def stderr_write_flush(s):
+    sys.stderr.write(s)
+    sys.stderr.flush()
+
+
+if sys.stdout.line_buffering:
+    def stdout_write(s):
+        sys.stdout.write(s)
+else:
+    stdout_write = stdout_write_flush
+
+
+if sys.stderr.line_buffering:
+    def stderr_write(s):
+        sys.stderr.write(s)
+else:
+    stderr_write = stderr_write_flush
+
+
 def replace_std_streams(errors="replace"):
     """Replace standard streams and set their error handlers to 'errors'"""
     for name in ("stdout", "stdin", "stderr"):
@@ -255,7 +279,7 @@ class NullOutput():
     def skip(self, path):
         """Print a message indicating that a download has been skipped"""
 
-    def success(self, path, tries):
+    def success(self, path):
         """Print a message indicating the completion of a download"""
 
     def progress(self, bytes_total, bytes_downloaded, bytes_per_second):
@@ -265,14 +289,10 @@ class NullOutput():
 class PipeOutput(NullOutput):
 
     def skip(self, path):
-        stdout = sys.stdout
-        stdout.write(CHAR_SKIP + path + "\n")
-        stdout.flush()
+        stdout_write(CHAR_SKIP + path + "\n")
 
-    def success(self, path, tries):
-        stdout = sys.stdout
-        stdout.write(path + "\n")
-        stdout.flush()
+    def success(self, path):
+        stdout_write(path + "\n")
 
 
 class TerminalOutput(NullOutput):
@@ -288,38 +308,43 @@ class TerminalOutput(NullOutput):
             self.shorten = util.identity
 
     def start(self, path):
-        stdout = sys.stdout
-        stdout.write(self.shorten("  " + path))
-        stdout.flush()
+        stdout_write_flush(self.shorten("  " + path))
 
     def skip(self, path):
-        sys.stdout.write(self.shorten(CHAR_SKIP + path) + "\n")
+        stdout_write(self.shorten(CHAR_SKIP + path) + "\n")
 
-    def success(self, path, tries):
-        sys.stdout.write("\r" + self.shorten(CHAR_SUCCESS + path) + "\n")
+    def success(self, path):
+        stdout_write("\r" + self.shorten(CHAR_SUCCESS + path) + "\n")
 
     def progress(self, bytes_total, bytes_downloaded, bytes_per_second):
         bdl = util.format_value(bytes_downloaded)
         bps = util.format_value(bytes_per_second)
         if bytes_total is None:
-            sys.stderr.write("\r{:>7}B {:>7}B/s ".format(bdl, bps))
+            stderr_write("\r{:>7}B {:>7}B/s ".format(bdl, bps))
         else:
-            sys.stderr.write("\r{:>3}% {:>7}B {:>7}B/s ".format(
+            stderr_write("\r{:>3}% {:>7}B {:>7}B/s ".format(
                 bytes_downloaded * 100 // bytes_total, bdl, bps))
 
 
 class ColorOutput(TerminalOutput):
 
+    def __init__(self):
+        TerminalOutput.__init__(self)
+
+        colors = config.get(("output",), "colors") or {}
+        self.color_skip = "\033[{}m".format(
+            colors.get("skip", "2"))
+        self.color_success = "\r\033[{}m".format(
+            colors.get("success", "1;32"))
+
     def start(self, path):
-        stdout = sys.stdout
-        stdout.write(self.shorten(path))
-        stdout.flush()
+        stdout_write_flush(self.shorten(path))
 
     def skip(self, path):
-        sys.stdout.write("\033[2m" + self.shorten(path) + "\033[0m\n")
+        stdout_write(self.color_skip + self.shorten(path) + "\033[0m\n")
 
-    def success(self, path, tries):
-        sys.stdout.write("\r\033[1;32m" + self.shorten(path) + "\033[0m\n")
+    def success(self, path):
+        stdout_write(self.color_success + self.shorten(path) + "\033[0m\n")
 
 
 class EAWCache(dict):
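
The new ColorOutput constructor makes the skip/success colors configurable as ANSI SGR codes under output.colors. A hypothetical snippet for gallery-dl's JSON config, using the default values visible in the diff:

    {
        "output": {
            "colors": {
                "success": "1;32",
                "skip": "2"
            }
        }
    }
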
diff --git a/gallery_dl/path.py b/gallery_dl/path.py
index c85bb88..84ee7af 100644
--- a/gallery_dl/path.py
+++ b/gallery_dl/path.py
@@ -74,7 +74,7 @@ class PathFormat():
         self.directory = self.realdirectory = \
             self.filename = self.extension = self.prefix = \
             self.path = self.realpath = self.temppath = ""
-        self.delete = self._create_directory = False
+        self.delete = False
 
         extension_map = config("extension-map")
         if extension_map is None:
@@ -138,7 +138,11 @@ class PathFormat():
 
     def open(self, mode="wb"):
         """Open file and return a corresponding file object"""
-        return open(self.temppath, mode)
+        try:
+            return open(self.temppath, mode)
+        except FileNotFoundError:
+            os.makedirs(self.realdirectory)
+            return open(self.temppath, mode)
 
     def exists(self):
         """Return True if the file exists on disk"""
@@ -187,7 +191,6 @@ class PathFormat():
             directory += sep
 
         self.realdirectory = directory
-        self._create_directory = True
 
     def set_filename(self, kwdict):
         """Set general filename data"""
@@ -279,9 +282,6 @@ class PathFormat():
 
     def build_path(self):
         """Combine directory and filename to full paths"""
-        if self._create_directory:
-            os.makedirs(self.realdirectory, exist_ok=True)
-            self._create_directory = False
         self.filename = filename = self.build_filename(self.kwdict)
         self.path = self.directory + filename
         self.realpath = self.realdirectory + filename
@@ -317,11 +317,18 @@ class PathFormat():
 
         if self.temppath != self.realpath:
             # Move temp file to its actual location
-            try:
-                os.replace(self.temppath, self.realpath)
-            except OSError:
-                shutil.copyfile(self.temppath, self.realpath)
-                os.unlink(self.temppath)
+            while True:
+                try:
+                    os.replace(self.temppath, self.realpath)
+                except FileNotFoundError:
+                    # delayed directory creation
+                    os.makedirs(self.realdirectory)
+                    continue
+                except OSError:
+                    # move across different filesystems
+                    shutil.copyfile(self.temppath, self.realpath)
+                    os.unlink(self.temppath)
+                break
 
         mtime = self.kwdict.get("_mtime")
         if mtime:
diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py
index 5e8f3e9..4e86239 100644
--- a/gallery_dl/postprocessor/metadata.py
+++ b/gallery_dl/postprocessor/metadata.py
@@ -95,7 +95,7 @@ class MetadataPP(PostProcessor):
             with open(path, "w", encoding="utf-8") as fp:
                 self.write(fp, pathfmt.kwdict)
         except FileNotFoundError:
-            os.makedirs(directory, exist_ok=True)
+            os.makedirs(directory)
             with open(path, "w", encoding="utf-8") as fp:
                 self.write(fp, pathfmt.kwdict)
diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py
index fb57e84..0b4c259 100644
--- a/gallery_dl/postprocessor/ugoira.py
+++ b/gallery_dl/postprocessor/ugoira.py
@@ -119,6 +119,9 @@ class UgoiraPP(PostProcessor):
         if self.args:
             args += self.args
 
+        # ensure target directory exists
+        os.makedirs(pathfmt.realdirectory, exist_ok=True)
+
         # invoke ffmpeg
         try:
             if self.twopass:
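
The path.py changes switch to lazy directory creation: instead of calling makedirs() up front for every path, directories are created only once an open() or os.replace() actually fails. The pattern, reduced to a standalone sketch:

    import os

    def open_creating_dirs(path, mode="wb"):
        # the common case costs nothing extra; the parent directory
        # is created only after the first open() fails
        try:
            return open(path, mode)
        except FileNotFoundError:
            os.makedirs(os.path.dirname(path))
            return open(path, mode)
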
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index 4bb220a..009ee08 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -302,9 +302,9 @@ def set_mtime(path, mtime):
         pass
 
 
-def load_cookiestxt(fp):
-    """Parse a Netscape cookies.txt file and return a list of its Cookies"""
-    cookies = []
+def cookiestxt_load(fp, cookiejar):
+    """Parse a Netscape cookies.txt file and add its Cookies to 'cookiejar'"""
+    set_cookie = cookiejar.set_cookie
 
     for line in fp:
 
@@ -321,11 +321,12 @@
             domain, domain_specified, path, secure, expires, name, value = \
                 line.split("\t")
+
         if not name:
             name = value
             value = None
 
-        cookies.append(Cookie(
+        set_cookie(Cookie(
             0, name, value, None, False,
             domain,
@@ -337,12 +338,11 @@
             False, None, None, {},
         ))
 
-    return cookies
-
 
-def save_cookiestxt(fp, cookies):
+def cookiestxt_store(fp, cookies):
     """Write 'cookies' in Netscape cookies.txt format to 'fp'"""
-    fp.write("# Netscape HTTP Cookie File\n\n")
+    write = fp.write
+    write("# Netscape HTTP Cookie File\n\n")
 
     for cookie in cookies:
         if not cookie.domain:
@@ -355,15 +355,15 @@
             name = cookie.name
             value = cookie.value
 
-        fp.write("\t".join((
+        write("\t".join((
             cookie.domain,
             "TRUE" if cookie.domain.startswith(".") else "FALSE",
             cookie.path,
             "TRUE" if cookie.secure else "FALSE",
             "0" if cookie.expires is None else str(cookie.expires),
             name,
-            value,
-        )) + "\n")
+            value + "\n",
+        )))
 
 
 def code_to_language(code, default=None):
@@ -695,12 +695,18 @@ class ExtendedUrl():
 
 class DownloadArchive():
 
     def __init__(self, path, format_string, cache_key="_archive_key"):
-        con = sqlite3.connect(path, timeout=60, check_same_thread=False)
+        try:
+            con = sqlite3.connect(path, timeout=60, check_same_thread=False)
+        except sqlite3.OperationalError:
+            os.makedirs(os.path.dirname(path))
+            con = sqlite3.connect(path, timeout=60, check_same_thread=False)
         con.isolation_level = None
+
         self.close = con.close
         self.cursor = con.cursor()
-        self.keygen = format_string.format_map
+
+        from . import formatter
+        self.keygen = formatter.parse(format_string).format_map
         self._cache_key = cache_key
 
         try:
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index 624f288..1881291 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-__version__ = "1.21.2"
+__version__ = "1.22.0"
diff --git a/gallery_dl/ytdl.py b/gallery_dl/ytdl.py
index 45b9826..b2da445 100644
--- a/gallery_dl/ytdl.py
+++ b/gallery_dl/ytdl.py
@@ -395,9 +395,6 @@ def parse_command_line(module, argv):
         "allow_multiple_audio_streams": opts.allow_multiple_audio_streams,
         "check_formats": getattr(
             opts, "check_formats", None),
-        "listformats": opts.listformats,
-        "listformats_table": getattr(
-            opts, "listformats_table", None),
         "outtmpl": opts.outtmpl,
         "outtmpl_na_placeholder": opts.outtmpl_na_placeholder,
         "paths": getattr(opts, "paths", None),
@@ -448,7 +445,6 @@ def parse_command_line(module, argv):
         "writesubtitles": opts.writesubtitles,
         "writeautomaticsub": opts.writeautomaticsub,
         "allsubtitles": opts.allsubtitles,
-        "listsubtitles": opts.listsubtitles,
         "subtitlesformat": opts.subtitlesformat,
         "subtitleslangs": opts.subtitleslangs,
         "matchtitle": module.decodeOption(opts.matchtitle),
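
The renamed cookie helpers now operate on a cookiejar instead of building and returning a list. Hypothetical usage with a standard-library jar (file names are examples):

    import http.cookiejar
    from gallery_dl import util

    jar = http.cookiejar.CookieJar()
    with open("cookies.txt") as fp:
        util.cookiestxt_load(fp, jar)    # adds Cookies via jar.set_cookie

    with open("cookies-copy.txt", "w") as fp:
        util.cookiestxt_store(fp, jar)   # any iterable of Cookies works
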
