Diffstat (limited to 'gallery_dl/extractor')
25 files changed, 380 insertions(+), 233 deletions(-)
diff --git a/gallery_dl/extractor/35photo.py b/gallery_dl/extractor/35photo.py
index 50dbfe8..d3e9276 100644
--- a/gallery_dl/extractor/35photo.py
+++ b/gallery_dl/extractor/35photo.py
@@ -72,7 +72,6 @@ class _35photoExtractor(Extractor):
             "user"     : data["user_login"],
             "user_id"  : data["user_id"],
             "user_name": data["user_name"],
-            "other"    : data["otherData"],
         }

         if "series" in data:
@@ -89,6 +88,8 @@ class _35photoExtractor(Extractor):
     def _photo_ids(page):
         """Extract unique photo IDs and return them as sorted list"""
         # searching for photo-id="..." doesn't always work (see unit tests)
+        if not page:
+            return ()
         return sorted(
             set(text.extract_iter(page, "/photo_", "/")),
             key=text.parse_int,
@@ -100,7 +101,7 @@ class _35photoUserExtractor(_35photoExtractor):
     """Extractor for all images of a user on 35photo.pro"""
     subcategory = "user"
     pattern = (r"(?:https?://)?(?:[a-z]+\.)?35photo\.pro"
-               r"/(?!photo_|genre_)([^/?&#]+)")
+               r"/(?!photo_|genre_|rating/)([^/?&#]+)")
     test = (
         ("https://35photo.pro/liya", {
             "pattern": r"https://m\d+.35photo.pro/photos_(main|series)/.*.jpg",
@@ -146,7 +147,14 @@ class _35photoGenreExtractor(_35photoExtractor):
         ("https://35photo.pro/genre_109/", {
             "range": "1-30",
         }),
-        ("https://35photo.pro/genre_109/new/"),
+        ("https://35photo.pro/genre_103/", {
+            "range": "1-30",
+            "count": 30,
+        }),
+        ("https://35photo.pro/genre_103/new/", {
+            "range": "1-30",
+            "count": 30,
+        }),
     )

     def __init__(self, match):
@@ -165,6 +173,8 @@ class _35photoGenreExtractor(_35photoExtractor):
         }

     def photos(self):
+        if not self.photo_ids:
+            return ()
         return self._pagination({
             "page": "genre",
             "community_id": self.genre_id,
@@ -193,7 +203,6 @@ class _35photoImageExtractor(_35photoExtractor):
             "user"     : "liya",
             "user_id"  : 20415,
             "user_name": "Liya Mirzaeva",
-            "other"    : str,
         },
     })

diff --git a/gallery_dl/extractor/500px.py b/gallery_dl/extractor/500px.py
index 00b8ab5..07c2e14 100644
--- a/gallery_dl/extractor/500px.py
+++ b/gallery_dl/extractor/500px.py
@@ -31,8 +31,7 @@ class _500pxExtractor(Extractor):

         for photo in self.photos():
             url = photo["images"][-1]["url"]
-            fmt = photo["image_format"]
-            photo["extension"] = "jpg" if fmt == "jpeg" else fmt
+            photo["extension"] = photo["image_format"]
             if data:
                 photo.update(data)
             if first:
@@ -59,7 +58,7 @@ class _500pxExtractor(Extractor):
             "include_releases" : "true",
             "liked_by"         : "1",
             "following_sample" : "100",
-            "image_size"       : "32768",
+            "image_size"       : "4096",
             "ids"              : ",".join(str(p["id"]) for p in photos),
         }

@@ -90,7 +89,7 @@ class _500pxUserExtractor(_500pxExtractor):
     pattern = (r"(?:https?://)?500px\.com"
                r"/(?!photo/)([^/?&#]+)/?(?:$|\?|#)")
     test = ("https://500px.com/light_expression_photography", {
-        "pattern": r"https?://drscdn.500px.org/photo/\d+/m%3D5000/v2",
+        "pattern": r"https?://drscdn.500px.org/photo/\d+/m%3D4096/v2",
         "range": "1-99",
         "count": 99,
     })
@@ -124,7 +123,7 @@ class _500pxGalleryExtractor(_500pxExtractor):
     pattern = (r"(?:https?://)?500px\.com"
                r"/(?!photo/)([^/?&#]+)/galleries/([^/?&#]+)")
     test = ("https://500px.com/fashvamp/galleries/lera", {
-        "url": "8a520272ece83278166b4f8556f9c9da43c43c45",
+        "url": "002dc81dee5b4a655f0e31ad8349e8903b296df6",
         "count": 3,
         "keyword": {
             "gallery": dict,
@@ -144,7 +143,7 @@ class _500pxGalleryExtractor(_500pxExtractor):
         page = self.request(url).text
         self.csrf_token, pos = text.extract(page, 'csrf-token" content="', '"')
         self.user_id  , pos = text.extract(page, 'App.CuratorId =', '\n', pos)
-        self.user_id = self.user_id.strip()
+        self.user_id = self.user_id.strip(" '\";")

         # get gallery metadata; transform gallery name into id
         url = "https://api.500px.com/v1/users/{}/galleries/{}".format(
@@ -174,37 +173,30 @@ class _500pxImageExtractor(_500pxExtractor):
     subcategory = "image"
     pattern = r"(?:https?://)?500px\.com/photo/(\d+)"
     test = ("https://500px.com/photo/222049255/queen-of-coasts", {
-        "url": "d1eda7afeaa589f71f05b9bb5c0694e3ffb357cd",
+        "url": "fbdf7df39325cae02f5688e9f92935b0e7113315",
         "count": 1,
         "keyword": {
             "camera": "Canon EOS 600D",
             "camera_info": dict,
-            "collections_count": int,
             "comments": list,
             "comments_count": int,
-            "converted": False,
-            "converted_bits": int,
-            "created_at": "2017-08-01T04:40:05-04:00",
-            "crop_version": 0,
+            "created_at": "2017-08-01T08:40:05+00:00",
             "description": str,
-            "editored_by": dict,
+            "editored_by": None,
             "editors_choice": False,
             "extension": "jpg",
-            "favorites_count": int,
             "feature": "popular",
             "feature_date": "2017-08-01T09:58:28+00:00",
             "focal_length": "208",
             "height": 3111,
             "id": 222049255,
-            "image_format": "jpeg",
-            "image_url": str,
+            "image_format": "jpg",
+            "image_url": list,
             "images": list,
             "iso": "100",
             "lens": "EF-S55-250mm f/4-5.6 IS II",
             "lens_info": dict,
-            "license_type": 0,
-            "licensed_at": None,
-            "liked": False,
+            "liked": None,
             "location": None,
             "location_details": dict,
             "name": "Queen Of Coasts",
@@ -212,15 +204,11 @@ class _500pxImageExtractor(_500pxExtractor):
             "privacy": False,
             "profile": True,
             "rating": float,
-            "sales_count": int,
             "status": 1,
-            "store_download": False,
-            "store_height": 3111,
-            "store_width": 4637,
             "tags": list,
-            "taken_at": "2017-05-04T13:36:51-04:00",
+            "taken_at": "2017-05-04T17:36:51+00:00",
             "times_viewed": int,
-            "url": "/photo/222049255/queen-of-coasts-by-olesya-nabieva",
+            "url": "/photo/222049255/Queen-Of-Coasts-by-Olesya-Nabieva",
             "user": dict,
             "user_id": 12847235,
             "votes_count": int,
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 81d480e..189c163 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -24,6 +24,7 @@ modules = [
     "deviantart",
     "dynastyscans",
     "e621",
+    "erolord",
    "exhentai",
     "fallenangels",
     "flickr",
diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py
index 24197ad..f7b3bc1 100644
--- a/gallery_dl/extractor/artstation.py
+++ b/gallery_dl/extractor/artstation.py
@@ -79,9 +79,7 @@ class ArtstationExtractor(Extractor):
     def get_user_info(self, username):
         """Return metadata for a specific user"""
         url = "{}/users/{}/quick.json".format(self.root, username.lower())
-        response = self.request(url, expect=(404,))
-        if response.status_code == 404:
-            raise exception.NotFoundError("user")
+        response = self.request(url, notfound="user")
         return response.json()

     def _pagination(self, url, params=None):
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 175af63..5c40e2a 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -13,6 +13,7 @@ import time
 import netrc
 import queue
 import logging
+import datetime
 import requests
 import threading
 import http.cookiejar
@@ -39,10 +40,13 @@ class Extractor():
         self._init_headers()
         self._init_cookies()
         self._init_proxies()
-        self._retries = self.config("retries", 5)
+        self._retries = self.config("retries", 4)
         self._timeout = self.config("timeout", 30)
         self._verify = self.config("verify", True)

+        if self._retries < 0:
+            self._retries = float("inf")
+
     @classmethod
     def from_url(cls, url):
         if isinstance(cls.pattern, str):
@@ -63,11 +67,11 @@ class Extractor():
         return config.interpolate(
             ("extractor", self.category, self.subcategory, key), default)

-    def request(self, url, method="GET", *, session=None,
-                encoding=None, expect=(), retries=None, **kwargs):
-        tries = 0
-        retries = retries or self._retries
-        session = session or self.session
+    def request(self, url, method="GET", *, session=None, retries=None,
+                encoding=None, fatal=True, notfound=None, **kwargs):
+        tries = 1
+        retries = self._retries if retries is None else retries
+        session = self.session if session is None else session
         kwargs.setdefault("timeout", self._timeout)
         kwargs.setdefault("verify", self._verify)
@@ -83,26 +87,37 @@ class Extractor():
                 raise exception.HttpError(exc)
             else:
                 code = response.status_code
-                if 200 <= code < 400 or code in expect:
+                if 200 <= code < 400 or not fatal and \
+                        (400 <= code < 429 or 431 <= code < 500):
                     if encoding:
                         response.encoding = encoding
                     return response
+                if notfound and code == 404:
+                    raise exception.NotFoundError(notfound)
                 if cloudflare.is_challenge(response):
                     self.log.info("Solving Cloudflare challenge")
                     url, domain, cookies = cloudflare.solve_challenge(
                         session, response, kwargs)
                     cloudflare.cookies.update(self.category, (domain, cookies))
                     continue
+                if cloudflare.is_captcha(response):
+                    try:
+                        import OpenSSL  # noqa
+                    except ImportError:
+                        msg = " - Install 'pyOpenSSL' and try again"
+                    else:
+                        msg = ""
+                    self.log.warning("Cloudflare CAPTCHA" + msg)

                 msg = "{}: {} for url: {}".format(code, response.reason, url)
-                if code < 500 and code != 429:
+                if code < 500 and code != 429 and code != 430:
                     break

-            tries += 1
-            self.log.debug("%s (%d/%d)", msg, tries, retries)
-            if tries >= retries:
+            self.log.debug("%s (%s/%s)", msg, tries, retries+1)
+            if tries > retries:
                 break
-            time.sleep(2 ** tries)
+            time.sleep(min(2 ** (tries-1), 1800))
+            tries += 1

         raise exception.HttpError(msg)
@@ -130,8 +145,8 @@ class Extractor():
         headers.clear()

         headers["User-Agent"] = self.config(
-            "user-agent", ("Mozilla/5.0 (X11; Linux x86_64; rv:62.0) "
-                           "Gecko/20100101 Firefox/62.0"))
+            "user-agent", ("Mozilla/5.0 (X11; Linux x86_64; rv:68.0) "
+                           "Gecko/20100101 Firefox/68.0"))
         headers["Accept"] = "*/*"
         headers["Accept-Language"] = "en-US,en;q=0.5"
         headers["Accept-Encoding"] = "gzip, deflate"
@@ -203,6 +218,20 @@ class Extractor():
                 return False
         return True

+    def _get_date_min_max(self, dmin=None, dmax=None):
+        """Retrieve and parse 'date-min' and 'date-max' config values"""
+        def get(key, default):
+            ts = self.config(key, default)
+            if isinstance(ts, str):
+                try:
+                    ts = int(datetime.datetime.strptime(ts, fmt).timestamp())
+                except ValueError as exc:
+                    self.log.warning("Unable to parse '%s': %s", key, exc)
+                    ts = default
+            return ts
+        fmt = self.config("date-format", "%Y-%m-%dT%H:%M:%S")
+        return get("date-min", dmin), get("date-max", dmax)
+
     @classmethod
     def _get_tests(cls):
         """Yield an extractor's test cases as (URL, RESULTS) tuples"""
@@ -403,30 +432,36 @@ def generate_extractors(extractor_data, symtable, classes):
 http.cookiejar.MozillaCookieJar.magic_re = re.compile(
     "#( Netscape)? HTTP Cookie File", re.IGNORECASE)

-# Update default cipher list of urllib3
-# to fix issues with Cloudflare and, by extension, Artstation (#227)
-from requests.packages.urllib3.util import ssl_  # noqa
-logging.getLogger("gallery-dl").debug("updating default urllib3 ciphers")
-
-# cipher list taken from urllib3 1.25
-# https://github.com/urllib3/urllib3/blob/1.25/src/urllib3/util/ssl_.py
-# with additions from
-# https://github.com/Anorov/cloudflare-scrape/pull/242
-ssl_.DEFAULT_CIPHERS = (
-    "ECDHE+AESGCM:"
-    "ECDHE+CHACHA20:"
-    "DHE+AESGCM:"
-    "DHE+CHACHA20:"
-    "ECDH+AESGCM:"
-    "DH+AESGCM:"
-    "ECDH+AES:"
-    "DH+AES:"
-    "RSA+AESGCM:"
-    "RSA+AES:"
-    "!ECDHE+SHA:"
-    "!AES128-SHA:"
-    "!aNULL:"
-    "!eNULL:"
-    "!MD5:"
-    "!DSS"
-)
+# Replace default cipher list of urllib3 to avoid Cloudflare CAPTCHAs
+ciphers = config.get(("ciphers",), True)
+if ciphers:
+    logging.getLogger("gallery-dl").debug("Updating urllib3 ciphers")
+
+    if ciphers is True:
+        ciphers = (
+            # Firefox's list
+            "TLS_AES_128_GCM_SHA256:"
+            "TLS_CHACHA20_POLY1305_SHA256:"
+            "TLS_AES_256_GCM_SHA384:"
+            "ECDHE-ECDSA-AES128-GCM-SHA256:"
+            "ECDHE-RSA-AES128-GCM-SHA256:"
+            "ECDHE-ECDSA-CHACHA20-POLY1305:"
+            "ECDHE-RSA-CHACHA20-POLY1305:"
+            "ECDHE-ECDSA-AES256-GCM-SHA384:"
+            "ECDHE-RSA-AES256-GCM-SHA384:"
+            "ECDHE-ECDSA-AES256-SHA:"
+            "ECDHE-ECDSA-AES128-SHA:"
+            "ECDHE-RSA-AES128-SHA:"
+            "ECDHE-RSA-AES256-SHA:"
+            "DHE-RSA-AES128-SHA:"
+            "DHE-RSA-AES256-SHA:"
+            "AES128-SHA:"
+            "AES256-SHA:"
+            "DES-CBC3-SHA"
+        )
+    elif isinstance(ciphers, list):
+        ciphers = ":".join(ciphers)
+
+    from requests.packages.urllib3.util import ssl_  # noqa
+    ssl_.DEFAULT_CIPHERS = ciphers
+    del ssl_
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index ebab040..63e2913 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -205,8 +205,7 @@ class DeviantartExtractor(Extractor):

     @staticmethod
     def _find_folder(folders, name):
-        pattern = re.compile(
-            r"[^\w]*" + name.replace("-", r"[^\w]+") + r"[^\w]*$")
+        pattern = re.compile(r"(?i)\W*" + name.replace("-", r"\W+") + r"\W*$")
         for folder in folders:
             if pattern.match(folder["name"]):
                 return folder
@@ -416,7 +415,7 @@ class DeviantartDeviationExtractor(DeviantartExtractor):

     def deviations(self):
         url = "{}/{}/{}".format(self.root, self.user, self.path)
-        response = self._html_request(url, expect=range(400, 500))
+        response = self._html_request(url, fatal=False)
         deviation_id = text.extract(response.text, '//deviation/', '"')[0]
         if response.status_code >= 400 or not deviation_id:
             raise exception.NotFoundError("image")
@@ -767,7 +766,7 @@ class DeviantartAPI():
     def user_profile(self, username):
         """Get user profile information"""
         endpoint = "user/profile/" + username
-        return self._call(endpoint, expect_error=True)
+        return self._call(endpoint, fatal=False)

     def authenticate(self, refresh_token):
         """Authenticate the application by requesting an access token"""
@@ -797,7 +796,7 @@ class DeviantartAPI():
             _refresh_token_cache.update(refresh_token, data["refresh_token"])
         return "Bearer " + data["access_token"]

-    def _call(self, endpoint, params=None, expect_error=False, public=True):
+    def _call(self, endpoint, params=None, fatal=True, public=True):
         """Call an API endpoint"""
         url = "https://www.deviantart.com/api/v1/oauth2/" + endpoint
         while True:
@@ -806,11 +805,7 @@ class DeviantartAPI():
                 self.authenticate(None if public else self.refresh_token)

             response = self.extractor.request(
-                url,
-                params=params,
-                headers=self.headers,
-                expect=range(400, 500),
-            )
+                url, headers=self.headers, params=params, fatal=False)
             data = response.json()
             status = response.status_code

@@ -818,7 +813,7 @@ class DeviantartAPI():
                 if self.delay > self.delay_min:
                     self.delay -= 1
                 return data
-            if expect_error:
+            if not fatal:
                 return None
             if data.get("error_description") == "User not found.":
                 raise exception.NotFoundError("user or group")
diff --git a/gallery_dl/extractor/erolord.py b/gallery_dl/extractor/erolord.py
new file mode 100644
index 0000000..8628039
--- /dev/null
+++ b/gallery_dl/extractor/erolord.py
@@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from http://erolord.com/"""
+
+from .common import GalleryExtractor
+from .. import text, util
+import json
+
+
+class ErolordGalleryExtractor(GalleryExtractor):
+    """Extractor for image galleries from erolord.com"""
+    category = "erolord"
+    root = "http://erolord.com"
+    pattern = r"(?:https?://)?(?:www\.)?erolord.com(/doujin/(\d+)/?)"
+    test = ("http://erolord.com/doujin/2189055/", {
+        "url": "7ce6d10a3934102b95c9718a34ccd3d35f55d85f",
+        "keyword": {
+            "title"     : "Amazon No Hiyaku | Amazon Elixir",
+            "gallery_id": 2189055,
+            "count"     : 16,
+            "artist"    : ["Morris"],
+            "group"     : list,
+            "parody"    : list,
+            "characters": list,
+            "tags"      : list,
+            "lang"      : "en",
+            "language"  : "English",
+        },
+    })
+
+    def __init__(self, match):
+        GalleryExtractor.__init__(self, match)
+        self.gallery_id = match.group(2)
+
+    def metadata(self, page):
+        extr = text.extract_from(page)
+        split = text.split_html
+        title, _, language = extr('<h1 class="t64">', '</h1>').rpartition(" ")
+        language = language.strip("[]")
+
+        return {
+            "gallery_id": text.parse_int(self.gallery_id),
+            "title"     : text.unescape(title),
+            # double quotes for anime, circle, tags
+            # single quotes for characters, artist
+            "parody"    : split(extr('class="sp1">Anime:'     , "</div>\r")),
+            "characters": split(extr("class='sp1'>Characters:", "</div>\r")),
+            "artist"    : split(extr("class='sp1'>Artist:"    , "</div>\r")),
+            "group"     : split(extr('class="sp1">Circle:'    , "</div>\r")),
+            "tags"      : split(extr('class="sp1">Tags:'      , "</div>\r")),
+            "lang"      : util.language_to_code(language),
+            "language"  : language,
+        }
+
+    def images(self, page):
+        url = self.root + text.extract(page, 'id="d1"><a href="', '"')[0]
+        imgs = text.extract(self.request(url).text, 'var imgs=', ';')[0]
+        return [(self.root + path, None) for path in json.loads(imgs)]
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index d67c58a..20e0746 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -259,7 +259,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
     def _gallery_page(self):
         url = "{}/g/{}/{}/".format(
             self.root, self.gallery_id, self.gallery_token)
-        response = self.request(url, expect=range(400, 500))
+        response = self.request(url, fatal=False)
         page = response.text

         if response.status_code == 404 and "Gallery Not Available" in page:
@@ -271,7 +271,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
     def _image_page(self):
         url = "{}/s/{}/{}-{}".format(
             self.root, self.image_token, self.gallery_id, self.image_num)
-        page = self.request(url, expect=range(400, 500)).text
+        page = self.request(url, fatal=False).text

         if page.startswith(("Invalid page", "Keep trying")):
             raise exception.NotFoundError("image page")
diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py
index 0468c0b..c5e3d17 100644
--- a/gallery_dl/extractor/imgur.py
+++ b/gallery_dl/extractor/imgur.py
@@ -16,16 +16,15 @@ import json
 class ImgurExtractor(Extractor):
     """Base class for imgur extractors"""
     category = "imgur"
+    root = "https://imgur.com"

     def __init__(self, match):
         Extractor.__init__(self, match)
         self.item_id = match.group(1)
         self.mp4 = self.config("mp4", True)

-    def _get_data(self, urlpart):
-        response = self.request("https://imgur.com/" + urlpart, expect=(404,))
-        if response.status_code == 404:
-            raise exception.NotFoundError(self.subcategory)
+    def _get_data(self, path):
+        response = self.request(self.root + path, notfound=self.subcategory)
         data = text.extract(response.text, "image : ", ",\n")[0]
         return self._clean(json.loads(data))

@@ -102,7 +101,7 @@ class ImgurImageExtractor(ImgurExtractor):
     )

     def items(self):
-        image = self._get_data(self.item_id)
+        image = self._get_data("/" + self.item_id)
         url = self._prepare(image)

         yield Message.Version, 1
@@ -165,13 +164,13 @@ class ImgurAlbumExtractor(ImgurExtractor):
     )

     def items(self):
-        album = self._get_data("a/" + self.item_id + "/all")
+        album = self._get_data("/a/" + self.item_id + "/all")
         images = album["album_images"]["images"]
         del album["album_images"]

         if int(album["num_images"]) > len(images):
-            url = ("https://imgur.com/ajaxalbums/getimages/" +
-                   self.item_id + "/hit.json")
+            url = "{}/ajaxalbums/getimages/{}/hit.json".format(
+                self.root, self.item_id)
             images = self.request(url).json()["data"]["images"]

         yield Message.Version, 1
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 871236b..475e24b 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -11,7 +11,8 @@ import hashlib
 import json

 from .common import Extractor, Message
-from .. import text
+from .. import text, exception
+from ..cache import cache


 class InstagramExtractor(Extractor):
@@ -21,11 +22,14 @@ class InstagramExtractor(Extractor):
     filename_fmt = "{sidecar_media_id:?/_/}{media_id}.{extension}"
     archive_fmt = "{media_id}"
     root = "https://www.instagram.com"
+    cookiedomain = ".instagram.com"
+    cookienames = ("sessionid",)

     def get_metadata(self):
         return {}

     def items(self):
+        self.login()
         yield Message.Version, 1

         metadata = self.get_metadata()
@@ -40,6 +44,46 @@ class InstagramExtractor(Extractor):
                 yield Message.Url, \
                     'ytdl:{}/p/{}/'.format(self.root, data['shortcode']), data

+    def login(self):
+        if self._check_cookies(self.cookienames):
+            return
+        username, password = self._get_auth_info()
+        if username:
+            self.session.cookies.set("ig_cb", "1", domain="www.instagram.com")
+            self._update_cookies(self._login_impl(username, password))
+
+    @cache(maxage=360*24*3600, keyarg=1)
+    def _login_impl(self, username, password):
+        self.log.info("Logging in as %s", username)
+
+        page = self.request(self.root + "/accounts/login/").text
+        headers = {
+            "Referer"         : self.root + "/accounts/login/",
+            "X-IG-App-ID"     : "936619743392459",
+            "X-Requested-With": "XMLHttpRequest",
+        }
+
+        response = self.request(self.root + "/web/__mid/", headers=headers)
+        headers["X-CSRFToken"] = response.cookies["csrftoken"]
+        headers["X-Instagram-AJAX"] = text.extract(
+            page, '"rollout_hash":"', '"')[0]
+
+        url = self.root + "/accounts/login/ajax/"
+        data = {
+            "username"     : username,
+            "password"     : password,
+            "queryParams"  : "{}",
+            "optIntoOneTap": "true",
+        }
+        response = self.request(url, method="POST", headers=headers, data=data)
+
+        if not response.json().get("authenticated"):
+            raise exception.AuthenticationError()
+        return {
+            key: self.session.cookies.get(key)
+            for key in ("sessionid", "mid", "csrftoken")
+        }
+
     def _extract_shared_data(self, page):
         return json.loads(text.extract(page,
                           'window._sharedData = ', ';</script>')[0])
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index 9e0aaa3..282c389 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -48,22 +48,20 @@ class NewgroundsExtractor(Extractor):
         extr = text.extract_from(self.request(page_url).text)
         full = text.extract_from(json.loads(extr('"full_image_text":', '});')))
         data = {
+            "title"      : text.unescape(extr('"og:title" content="', '"')),
             "description": text.unescape(extr(':description" content="', '"')),
-            "date"       : extr('itemprop="datePublished" content="', '"'),
+            "date"       : text.parse_datetime(extr(
+                'itemprop="datePublished" content="', '"')),
             "rating"     : extr('class="rated-', '"'),
             "favorites"  : text.parse_int(extr('id="faves_load">', '<')),
             "score"      : text.parse_float(extr('id="score_number">', '<')),
+            "tags"       : text.split_html(extr(
+                '<dd class="tags momag">', '</dd>')),
             "url"        : full('src="', '"'),
-            "title"      : text.unescape(full('alt="', '"')),
             "width"      : text.parse_int(full('width="', '"')),
             "height"     : text.parse_int(full('height="', '"')),
         }
-
-        tags = text.split_html(extr('<dd class="tags momag">', '</dd>'))
-        tags.sort()
-        data["tags"] = tags
-
-        data["date"] = text.parse_datetime(data["date"])
+        data["tags"].sort()
         data["index"] = text.parse_int(
             data["url"].rpartition("/")[2].partition("_")[0])
         return data
@@ -95,7 +93,7 @@ class NewgroundsUserExtractor(NewgroundsExtractor):
     test = (
         ("https://blitzwuff.newgrounds.com/art", {
             "url": "24b19c4a135a09889fac7b46a74e427e4308d02b",
-            "keyword": "2aab0532a894ff3cf88dd01ce5c60f114011b268",
+            "keyword": "98566e0c8096a8099b8d71962fea7e31c8b098d4",
         }),
         ("https://blitzwuff.newgrounds.com/"),
     )
@@ -140,9 +138,9 @@ class NewgroundsVideoExtractor(NewgroundsExtractor):
     subcategory = "video"
     filename_fmt = "{category}_{index}.{extension}"
     pattern = r"(?:https?://)?([^.]+)\.newgrounds\.com/movies/?$"
-    test = ("https://twistedgrim.newgrounds.com/movies", {
+    test = ("https://tomfulp.newgrounds.com/movies", {
         "pattern": r"ytdl:https?://www\.newgrounds\.com/portal/view/\d+",
-        "count": ">= 29",
+        "count": ">= 32",
     })

     def get_page_urls(self):
diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py
index abf1eaa..4c48d73 100644
--- a/gallery_dl/extractor/nijie.py
+++ b/gallery_dl/extractor/nijie.py
@@ -106,13 +106,8 @@ class NijieExtractor(AsynchronousMixin, Extractor):
         params = {"id": self.user_id, "p": 1}

         while True:
-            response = self.request(url, params=params, expect=(404,))
-            if response.status_code == 404:
-                raise exception.NotFoundError("artist")
-
-            page = response.text
-            ids = list(text.extract_iter(page, ' illust_id="', '"'))
-            yield from ids
+            page = self.request(url, params=params, notfound="artist").text
+            yield from text.extract_iter(page, 'illust_id="', '"')

             if '<a rel="next"' not in page:
                 return
@@ -126,7 +121,7 @@ class NijieUserExtractor(NijieExtractor):
                r"/members(?:_illust)?\.php\?id=(\d+)")
     test = (
         ("https://nijie.info/members_illust.php?id=44", {
-            "url": "585d821df4716b1098660a0be426d01db4b65f2a",
+            "url": "66c4ff94c6e77c0765dd88f2d8c663055fda573e",
             "keyword": "d629c69e3172db1d7e026145e8eb640ac31ac16a",
         }),
         ("https://nijie.info/members_illust.php?id=43", {
@@ -174,7 +169,7 @@ class NijieImageExtractor(NijieExtractor):
                r"/view(?:_popup)?\.php\?id=(\d+)")
     test = (
         ("https://nijie.info/view.php?id=70720", {
-            "url": "a10d4995645b5f260821e32c60a35f73546c2699",
+            "url": "5497f897311397dafa188521258624346a0af2a3",
             "keyword": "408393d010307c76d52cbd0a4368d6d357805aea",
             "content": "d85e3ea896ed5e4da0bca2390ad310a4df716ca6",
         }),
@@ -190,10 +185,8 @@ class NijieImageExtractor(NijieExtractor):
         self.page = ""

     def get_job_metadata(self):
-        response = self.request(self.view_url + self.image_id, expect=(404,))
-        if response.status_code == 404:
-            raise exception.NotFoundError("image")
-        self.page = response.text
+        self.page = self.request(
+            self.view_url + self.image_id, notfound="image").text
         self.user_id = text.extract(
             self.page, '"sameAs": "https://nijie.info/members.php?id=', '"')[0]
         return NijieExtractor.get_job_metadata(self)
diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py
index fa8cd48..f5b8869 100644
--- a/gallery_dl/extractor/pinterest.py
+++ b/gallery_dl/extractor/pinterest.py
@@ -228,14 +228,14 @@ class PinterestAPI():
         params = {"data": json.dumps({"options": options}), "source_url": ""}

         response = self.extractor.request(
-            url, params=params, headers=self.HEADERS, expect=range(400, 500))
+            url, params=params, headers=self.HEADERS, fatal=False)

         try:
             data = response.json()
         except ValueError:
             data = {}

-        if 200 <= response.status_code < 400 and not response.history:
+        if response.status_code < 400 and not response.history:
             return data

         if response.status_code == 404 or response.history:
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index af29c4b..76d4dc4 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -143,9 +143,7 @@ class PixivMeExtractor(PixivExtractor):
     def items(self):
         url = "https://pixiv.me/" + self.account
         response = self.request(
-            url, method="HEAD", allow_redirects=False, expect=(404,))
-        if response.status_code == 404:
-            raise exception.NotFoundError("user")
+            url, method="HEAD", allow_redirects=False, notfound="user")
         yield Message.Version, 1
         yield Message.Queue, response.headers["Location"], {}
@@ -445,7 +443,7 @@ class PixivAppAPI():
             data["password"] = password

         response = self.extractor.request(
-            url, method="POST", data=data, expect=(400,))
+            url, method="POST", data=data, fatal=False)
         if response.status_code >= 400:
             raise exception.AuthenticationError()
@@ -491,10 +489,9 @@ class PixivAppAPI():
         url = "https://app-api.pixiv.net/" + endpoint

         self.login()
-        response = self.extractor.request(
-            url, params=params, expect=range(400, 500))
+        response = self.extractor.request(url, params=params, fatal=False)

-        if 200 <= response.status_code < 400:
+        if response.status_code < 400:
             return response.json()
         if response.status_code == 404:
             raise exception.NotFoundError()
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index 0c5a924..2ba4b99 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -11,7 +11,6 @@ from .common import Extractor, Message
 from .. import text, util, extractor, exception
 from ..cache import cache

-import datetime
 import time
@@ -235,8 +234,7 @@ class RedditAPI():
         url = "https://oauth.reddit.com" + endpoint
         params["raw_json"] = 1
         self.authenticate()
-        response = self.extractor.request(
-            url, params=params, expect=range(400, 500))
+        response = self.extractor.request(url, params=params, fatal=False)
         remaining = response.headers.get("x-ratelimit-remaining")
         if remaining and float(remaining) < 2:
             wait = int(response.headers["x-ratelimit-reset"])
@@ -252,12 +250,9 @@ class RedditAPI():
         return data

     def _pagination(self, endpoint, params, _empty=()):
-        date_fmt = self.extractor.config("date-format", "%Y-%m-%dT%H:%M:%S")
-        date_min = self._parse_datetime("date-min", 0, date_fmt)
-        date_max = self._parse_datetime("date-max", 253402210800, date_fmt)
-
         id_min = self._parse_id("id-min", 0)
         id_max = self._parse_id("id-max", 2147483647)
+        date_min, date_max = self.extractor._get_date_min_max(0, 253402210800)

         while True:
             data = self._call(endpoint, params)["data"]
@@ -294,16 +289,6 @@ class RedditAPI():
             if link_id and extra:
                 yield from self.morechildren(link_id, extra)

-    def _parse_datetime(self, key, default, fmt):
-        ts = self.extractor.config(key, default)
-        if isinstance(ts, str):
-            try:
-                ts = int(datetime.datetime.strptime(ts, fmt).timestamp())
-            except ValueError as exc:
-                self.log.warning("Unable to parse '%s': %s", key, exc)
-                ts = default
-        return ts
-
     def _parse_id(self, key, default):
         sid = self.extractor.config(key)
         return self._decode(sid.rpartition("_")[2].lower()) if sid else default
diff --git a/gallery_dl/extractor/sankakucomplex.py b/gallery_dl/extractor/sankakucomplex.py
index 22b2b63..55eda9f 100644
--- a/gallery_dl/extractor/sankakucomplex.py
+++ b/gallery_dl/extractor/sankakucomplex.py
@@ -110,8 +110,8 @@ class SankakucomplexTagExtractor(SankakucomplexExtractor):
         yield Message.Version, 1
         while True:
             url = "{}/{}/page/{}/".format(self.root, self.path, pnum)
-            response = self.request(url, expect=(404,))
-            if response.status_code == 404:
+            response = self.request(url, fatal=False)
+            if response.status_code >= 400:
                 return
             for url in text.extract_iter(response.text, 'data-direct="', '"'):
                 if url != last:
diff --git a/gallery_dl/extractor/seiga.py b/gallery_dl/extractor/seiga.py
index f63c999..0d92573 100644
--- a/gallery_dl/extractor/seiga.py
+++ b/gallery_dl/extractor/seiga.py
@@ -43,9 +43,7 @@ class SeigaExtractor(Extractor):
         """Get url for an image with id 'image_id'"""
         url = "{}/image/source/{}".format(self.root, image_id)
         response = self.request(
-            url, method="HEAD", allow_redirects=False, expect=(404,))
-        if response.status_code == 404:
-            raise exception.NotFoundError("image")
+            url, method="HEAD", allow_redirects=False, notfound="image")
         return response.headers["Location"].replace("/o/", "/priv/", 1)

     def login(self):
diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py
index aa2b16b..afd4eaa 100644
--- a/gallery_dl/extractor/sexcom.py
+++ b/gallery_dl/extractor/sexcom.py
@@ -23,9 +23,9 @@ class SexcomExtractor(Extractor):
     def items(self):
         yield Message.Version, 1
         yield Message.Directory, self.metadata()
-        for url in self.pins():
-            pin = self._parse_pin(url)
-            yield Message.Url, pin["url"], pin
+        for pin in map(self._parse_pin, self.pins()):
+            if pin:
+                yield Message.Url, pin["url"], pin

     def metadata(self):
         return {}
@@ -49,8 +49,13 @@ class SexcomExtractor(Extractor):
                 return
             url = text.urljoin(self.root, url)

-    def _parse_pin(self, pin_url):
-        extr = text.extract_from(self.request(pin_url).text)
+    def _parse_pin(self, url):
+        response = self.request(url, fatal=False)
+        if response.status_code >= 400:
+            self.log.warning('Unable to fetch %s ("%s: %s")',
+                             url, response.status_code, response.reason)
+            return None
+        extr = text.extract_from(response.text)
         data = {}

         data["thumbnail"] = extr('itemprop="thumbnail" content="', '"')
@@ -88,10 +93,10 @@ class SexcomExtractor(Extractor):


 class SexcomPinExtractor(SexcomExtractor):
-    """Extractor a pinned image or video on www.sex.com"""
+    """Extractor for a pinned image or video on www.sex.com"""
     subcategory = "pin"
     directory_fmt = ("{category}",)
-    pattern = r"(?:https?://)?(?:www\.)?sex\.com/pin/(\d+)"
+    pattern = r"(?:https?://)?(?:www\.)?sex\.com/pin/(\d+)(?!.*#related$)"
     test = (
         # picture
         ("https://www.sex.com/pin/56714360/", {
@@ -124,6 +129,10 @@ class SexcomPinExtractor(SexcomExtractor):
         ("https://www.sex.com/pin/55847384-very-nicely-animated/", {
             "pattern": "ytdl:https://www.pornhub.com/embed/ph56ef24b6750f2",
         }),
+        # 404
+        ("https://www.sex.com/pin/55847385/", {
+            "count": 0,
+        }),
     )

     def __init__(self, match):
@@ -134,6 +143,25 @@ class SexcomPinExtractor(SexcomExtractor):
         return ("{}/pin/{}/".format(self.root, self.pin_id),)


+class SexcomRelatedPinExtractor(SexcomPinExtractor):
+    """Extractor for related pins on www.sex.com"""
+    subcategory = "related-pin"
+    directory_fmt = ("{category}", "related {original_pin[pin_id]}")
+    pattern = r"(?:https?://)?(?:www\.)?sex\.com/pin/(\d+).*#related$"
+    test = ("https://www.sex.com/pin/56714360/#related", {
+        "count": 24,
+    })
+
+    def metadata(self):
+        pin = self._parse_pin(SexcomPinExtractor.pins(self)[0])
+        return {"original_pin": pin}
+
+    def pins(self):
+        url = "{}/pin/related?pinId={}&limit=24&offset=0".format(
+            self.root, self.pin_id)
+        return self._pagination(url)
+
+
 class SexcomBoardExtractor(SexcomExtractor):
     """Extractor for pins from a board on www.sex.com"""
     subcategory = "board"
diff --git a/gallery_dl/extractor/shopify.py b/gallery_dl/extractor/shopify.py
index 35895bb..b2498a0 100644
--- a/gallery_dl/extractor/shopify.py
+++ b/gallery_dl/extractor/shopify.py
@@ -10,7 +10,6 @@
 from .common import Extractor, Message, SharedConfigMixin, generate_extractors
 from .. import text
-import time
 import re
@@ -24,19 +23,9 @@ class ShopifyExtractor(SharedConfigMixin, Extractor):
         Extractor.__init__(self, match)
         self.item_url = self.root + match.group(1)

-    def request(self, url, method="GET", expect=range(400, 500), **kwargs):
-        tries = 0
-        kwargs["expect"] = expect
-        while True:
-            response = Extractor.request(self, url, method, **kwargs)
-            if response.status_code not in (429, 430):
-                return response
-            tries += 1
-            waittime = 2 ** (tries + 2)
-            self.log.warning(
-                "HTTP status %s: %s - Waiting for %d seconds",
-                response.status_code, response.reason, waittime)
-            time.sleep(waittime)
+    def request(self, url, **kwargs):
+        kwargs["retries"] = float("inf")
+        return Extractor.request(self, url, **kwargs)

     def items(self):
         data = self.metadata()
@@ -45,9 +34,10 @@ class ShopifyExtractor(SharedConfigMixin, Extractor):

         headers = {"X-Requested-With": "XMLHttpRequest"}
         for url in self.products():
-            response = self.request(url + ".json", headers=headers)
+            response = self.request(
+                url + ".json", headers=headers, fatal=False)
             if response.status_code >= 400:
-                self.log.warning('Skipping %s ("%d: %s")',
+                self.log.warning('Skipping %s ("%s: %s")',
                                  url, response.status_code, response.reason)
                 continue
             product = response.json()["product"]
@@ -89,10 +79,14 @@ class ShopifyCollectionExtractor(ShopifyExtractor):
         while True:
             page = self.request(self.item_url, params=params).text
             urls = search_re.findall(page)
+            last = None

             if not urls:
                 return
             for path in urls:
+                if last == path:
+                    continue
+                last = path
                 yield self.root + path
             params["page"] += 1
@@ -113,7 +107,7 @@ EXTRACTORS = {
         "pattern": r"(?:www\.)?fashionnova\.com",
         "test-product": (
             ("https://www.fashionnova.com/products/essential-slide-red", {
-                "pattern": r"https?://cdn\.shopify.com/",
+                "pattern": r"https?://cdn\d*\.shopify.com/",
                 "count": 3,
             }),
             ("https://www.fashionnova.com/collections/flats/products/name"),
diff --git a/gallery_dl/extractor/simplyhentai.py b/gallery_dl/extractor/simplyhentai.py
index 44dc6fe..5ad372d 100644
--- a/gallery_dl/extractor/simplyhentai.py
+++ b/gallery_dl/extractor/simplyhentai.py
@@ -23,7 +23,7 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
         (("https://original-work.simply-hentai.com"
           "/amazon-no-hiyaku-amazon-elixir"), {
             "url": "258289249990502c3138719cb89e995a60861e49",
-            "keyword": "18ab9defca53dbb2aeb7965193e93e0ea125b76b",
+            "keyword": "eba83ccdbab3022a2280c77aa747f9458196138b",
         }),
         ("https://www.simply-hentai.com/notfound", {
             "exception": exception.GalleryDLException,
@@ -40,30 +40,26 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
         self.session.headers["Referer"] = url

     def metadata(self, page):
-        extr = text.extract
-        title , pos = extr(page, '<meta property="og:title" content="', '"')
+        extr = text.extract_from(page)
+        split = text.split_html
+
+        title = extr('<meta property="og:title" content="', '"')
         if not title:
             raise exception.NotFoundError("gallery")
-        gid   , pos = extr(page, '/Album/', '/', pos)
-        series, pos = extr(page, 'box-title">Series</div>', '</div>', pos)
-        lang  , pos = extr(page, 'box-title">Language</div>', '</div>', pos)
-        chars , pos = extr(page, 'box-title">Characters</div>', '</div>', pos)
-        tags  , pos = extr(page, 'box-title">Tags</div>', '</div>', pos)
-        artist, pos = extr(page, 'box-title">Artists</div>', '</div>', pos)
-        date  , pos = extr(page, 'Uploaded', '</div>', pos)
-        lang = text.remove_html(lang) if lang else None

-        return {
-            "gallery_id": text.parse_int(gid),
+        data = {
             "title"     : text.unescape(title),
-            "artist"    : text.split_html(artist),
-            "parody"    : text.split_html(series),
-            "characters": text.split_html(chars),
-            "tags"      : text.split_html(tags),
-            "lang"      : util.language_to_code(lang),
-            "language"  : lang,
-            "date"      : text.remove_html(date),
+            "gallery_id": text.parse_int(extr('/Album/', '/')),
+            "parody"    : split(extr('box-title">Series</div>', '</div>')),
+            "language"  : text.remove_html(extr(
+                'box-title">Language</div>', '</div>')) or None,
+            "characters": split(extr('box-title">Characters</div>', '</div>')),
+            "tags"      : split(extr('box-title">Tags</div>', '</div>')),
+            "artist"    : split(extr('box-title">Artists</div>', '</div>')),
+            "date"      : text.parse_datetime(text.remove_html(
+                extr('Uploaded', '</div>')), "%d.%m.%Y"),
         }
+        data["lang"] = util.language_to_code(data["language"])
+        return data

     def images(self, _):
         url = self.chapter_url + "/all-pages"
@@ -102,12 +98,11 @@ class SimplyhentaiImageExtractor(Extractor):
         self.type = match.group(2)

     def items(self):
-        page = self.request(self.page_url).text
-        url_search = 'data-src="' if self.type == "image" else '<source src="'
-
-        title, pos = text.extract(page, '"og:title" content="', '"')
-        descr, pos = text.extract(page, '"og:description" content="', '"', pos)
-        url  , pos = text.extract(page, url_search, '"', pos)
+        extr = text.extract_from(self.request(self.page_url).text)
+        title = extr('"og:title" content="', '"')
+        descr = extr('"og:description" content="', '"')
+        url = extr('&quot;image&quot;:&quot;', '&')
+        url = extr("&quot;content&quot;:&quot;", "&") or url

         tags = text.extract(descr, " tagged with ", " online for free ")[0]
         if tags:
@@ -140,13 +135,13 @@ class SimplyhentaiVideoExtractor(Extractor):
         ("https://videos.simply-hentai.com/creamy-pie-episode-02", {
             "pattern": r"https://www\.googleapis\.com/drive/v3/files"
                        r"/0B1ecQ8ZVLm3JcHZzQzBnVy1ZUmc\?alt=media&key=[\w-]+",
-            "keyword": "29d63987fed33f0a9f4b3786d1d71b03d793250a",
+            "keyword": "706790708b14773efc1e075ddd3b738a375348a5",
             "count": 1,
         }),
         (("https://videos.simply-hentai.com"
           "/1715-tifa-in-hentai-gang-bang-3d-movie"), {
             "url": "ad9a36ae06c601b6490e3c401834b4949d947eb0",
-            "keyword": "c561341aa3c6999f615abf1971d28fb2a83da2a7",
+            "keyword": "f9dad94fbde9c95859e631ff4f07297a9567b874",
         }),
     )
@@ -178,8 +173,9 @@ class SimplyhentaiVideoExtractor(Extractor):
             "title": text.unescape(title),
             "episode": text.parse_int(episode),
             "tags": text.split_html(tags)[::2],
-            "date": text.remove_html(date),
             "type": "video",
+            "date": text.parse_datetime(text.remove_html(
+                date), "%B %d, %Y %H:%M"),
         })

         yield Message.Version, 1
diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py
index 80348ae..2e6508c 100644
--- a/gallery_dl/extractor/smugmug.py
+++ b/gallery_dl/extractor/smugmug.py
@@ -69,11 +69,11 @@ class SmugmugAlbumExtractor(SmugmugExtractor):
     archive_fmt = "a_{Album[AlbumKey]}_{Image[ImageKey]}"
     pattern = r"smugmug:album:([^:]+)$"
     test = (
-        ("smugmug:album:ddvxpg", {
-            "url": "0429e9bf50ee600674e448934e3882ca1761ae7b",
+        ("smugmug:album:cr4C7f", {
+            "url": "1436ee98d5797b308ecce5862e4885944f59c03c",
         }),
         # empty
-        ("smugmug:album:SXvjbW", {
+        ("smugmug:album:Fb7hMs", {
             "count": 0,
         }),
         # no "User"
@@ -109,10 +109,10 @@ class SmugmugImageExtractor(SmugmugExtractor):
     archive_fmt = "{Image[ImageKey]}"
     pattern = BASE_PATTERN + r"(?:/[^/?&#]+)+/i-([^/?&#-]+)"
     test = (
-        ("https://acapella.smugmug.com/Micro-Macro/Drops/i-g2Dmf9z", {
-            "url": "78f0bf3516b6d670b7319216bdeccb35942ca4cf",
-            "keyword": "b298ef7ed2b1918263b6a7dc6f56e54401584381",
-            "content": "64a8f69a1d824921eebbdf2420087937adfa45cd",
+        ("https://tdm.smugmug.com/Nature/Dove/i-kCsLJT6", {
+            "url": "f624ad7293afd6412a7d34e3950a118596c36c85",
+            "keyword": "ea70e93be5067dca988d871dcf9afac491a189a4",
+            "content": "ecbd9d7b4f75a637abc8d35319be9ec065a44eb0",
         }),
         # video
         ("https://tstravels.smugmug.com/Dailies/Daily-Dose-2015/i-39JFNzB", {
@@ -142,12 +142,12 @@ class SmugmugPathExtractor(SmugmugExtractor):
     subcategory = "path"
     pattern = BASE_PATTERN + r"((?:/[^/?&#a-fh-mo-z][^/?&#]*)*)/?$"
     test = (
-        ("https://acapella.smugmug.com/Micro-Macro/Drops/", {
-            "pattern": "smugmug:album:ddvxpg$",
+        ("https://tdm.smugmug.com/Nature/Dove", {
+            "pattern": "smugmug:album:cr4C7f$",
         }),
-        ("https://acapella.smugmug.com/", {
+        ("https://tdm.smugmug.com/", {
             "pattern": SmugmugAlbumExtractor.pattern,
-            "url": "797eb1cbbf5ad8ecac8ee4eedc6466ed77a65d68",
+            "url": "1640028712875b90974e5aecd91b60e6de6138c7",
         }),
         # gallery node without owner
         ("https://www.smugmug.com/gallery/n-GLCjnD/", {
diff --git a/gallery_dl/extractor/tsumino.py b/gallery_dl/extractor/tsumino.py
index 62a9173..03ee144 100644
--- a/gallery_dl/extractor/tsumino.py
+++ b/gallery_dl/extractor/tsumino.py
@@ -107,9 +107,9 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
     def images(self, page):
         url = "{}/Read/Load/?q={}".format(self.root, self.gallery_id)
         headers = {"Referer": self.chapter_url}
-        response = self.request(url, headers=headers, expect=(404,))
+        response = self.request(url, headers=headers, fatal=False)

-        if response.status_code == 404:
+        if response.status_code >= 400:
             url = "{}/Read/View/{}".format(self.root, self.gallery_id)
             self.log.error(
                 "Failed to get gallery JSON data. Visit '%s' in a browser "
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index 5679cdc..024d6e9 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -65,11 +65,15 @@ class TumblrExtractor(Extractor):
         if self.reblogs == "same-blog":
             self._skip_reblog = self._skip_reblog_same_blog

+        self.date_min, self.api.before = self._get_date_min_max(0, None)
+
     def items(self):
         blog = None
         yield Message.Version, 1

         for post in self.posts():
+            if self.date_min > post["timestamp"]:
+                return
             if post["type"] not in self.types:
                 continue
             if not blog:
@@ -207,7 +211,7 @@ class TumblrUserExtractor(TumblrExtractor):
         ("http://demo.tumblr.com/", {
             "pattern": (r"https?://(?:$|"
                         r"\d+\.media\.tumblr\.com/.+_1280\.jpg|"
-                        r"w+\.tumblr\.com/audio_file/demo/\d+/tumblr_\w+)"),
+                        r"a\.tumblr\.com/tumblr_\w+)"),
             "count": 3,
             "options": (("posts", "all"), ("external", True))
         }),
@@ -223,6 +227,11 @@ class TumblrUserExtractor(TumblrExtractor):
             "count": 2,
             "keyword": {"tags": ["test", "private", "hidden"]},
         }),
+        ("https://mikf123.tumblr.com/", {  # date-min/-max/-format (#337)
+            "count": 4,
+            "options": (("date-min", "201804"), ("date-max", "201805"),
+                        ("date-format", "%Y%m"))
+        }),
         ("https://demo.tumblr.com/page/2"),
         ("https://demo.tumblr.com/archive"),
         ("tumblr:http://www.b-authentique.com/"),
@@ -280,6 +289,7 @@ class TumblrPostExtractor(TumblrExtractor):
         TumblrExtractor.__init__(self, match)
         self.post_id = match.group(3)
         self.reblogs = True
+        self.date_min = 0

     def posts(self):
         return self.api.posts(self.blog, {"id": self.post_id})
@@ -328,7 +338,7 @@ class TumblrAPI(oauth.OAuth1API):

     def __init__(self, extractor):
         oauth.OAuth1API.__init__(self, extractor)
-        self.posts_type = None
+        self.posts_type = self.before = None

     def info(self, blog):
         """Return general information about a blog"""
@@ -350,6 +360,8 @@ class TumblrAPI(oauth.OAuth1API):
         params.update({"offset": 0, "limit": 50, "reblog_info": "true"})
         if self.posts_type:
             params["type"] = self.posts_type
+        if self.before:
+            params["before"] = self.before
         while True:
             data = self._call(blog, "posts", params)
             self.BLOG_CACHE[blog] = data["blog"]
@@ -360,7 +372,7 @@ class TumblrAPI(oauth.OAuth1API):

     def likes(self, blog):
         """Retrieve liked posts"""
-        params = {"limit": 50}
+        params = {"limit": "50", "before": self.before}
         while True:
             posts = self._call(blog, "likes", params)["liked_posts"]
             if not posts:
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index ad4dc46..ccba640 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -11,6 +11,7 @@
 from .common import Extractor, Message
 from .. import text, exception
 from ..cache import cache
+import re


 class TwitterExtractor(Extractor):
@@ -26,8 +27,13 @@ class TwitterExtractor(Extractor):
         Extractor.__init__(self, match)
         self.user = match.group(1)
         self.retweets = self.config("retweets", True)
+        self.content = self.config("content", False)
         self.videos = self.config("videos", False)

+        if self.content:
+            self._emoji_sub = re.compile(
+                r'<img class="Emoji [^>]+ alt="([^"]+)"[^>]*>').sub
+
     def items(self):
         self.login()
         yield Message.Version, 1
@@ -35,6 +41,7 @@ class TwitterExtractor(Extractor):

         for tweet in self.tweets():
             data = self._data_from_tweet(tweet)
+
             if not self.retweets and data["retweet_id"]:
                 continue
@@ -87,10 +94,9 @@ class TwitterExtractor(Extractor):
             raise exception.AuthenticationError()
         return self.session.cookies

-    @staticmethod
-    def _data_from_tweet(tweet):
+    def _data_from_tweet(self, tweet):
         extr = text.extract_from(tweet)
-        return {
+        data = {
             "tweet_id"  : text.parse_int(extr('data-tweet-id="' , '"')),
             "retweet_id": text.parse_int(extr('data-retweet-id="', '"')),
             "retweeter" : extr('data-retweeter="' , '"'),
@@ -99,6 +105,14 @@ class TwitterExtractor(Extractor):
             "user_id"   : text.parse_int(extr('data-user-id="' , '"')),
             "date"      : text.parse_timestamp(extr('data-time="', '"')),
         }
+        if self.content:
+            content = extr('<div class="js-tweet-text-container">', '\n</div>')
+            if '<img class="Emoji ' in content:
+                content = self._emoji_sub(r"\1", content)
+            content = text.unescape(text.remove_html(content, "", ""))
+            cl, _, cr = content.rpartition("pic.twitter.com/")
+            data["content"] = cl if cl and len(cr) < 16 else content
+        return data

     def _tweets_from_api(self, url):
         params = {
@@ -186,6 +200,11 @@ class TwitterTweetExtractor(TwitterExtractor):
             "options": (("videos", True),),
             "pattern": r"ytdl:https://twitter.com/perrypumas/status/\d+",
         }),
+        # content with emoji, newlines, hashtags (#338)
+        ("https://twitter.com/yumi_san0112/status/1151144618936823808", {
+            "options": (("content", True),),
+            "keyword": "b13b6c4cd0b0c15b2ea7685479e7fedde3c47b9e",
+        }),
     )

     def __init__(self, match):
@@ -199,4 +218,4 @@ class TwitterTweetExtractor(TwitterExtractor):
         url = "{}/{}/status/{}".format(self.root, self.user, self.tweet_id)
         page = self.request(url).text
         return (text.extract(
-            page, '<div class="tweet ', '<ul class="stats')[0],)
+            page, '<div class="tweet ', 'class="js-tweet-stats-container')[0],)
diff --git a/gallery_dl/extractor/xvideos.py b/gallery_dl/extractor/xvideos.py
index 7eec18b..e253b7f 100644
--- a/gallery_dl/extractor/xvideos.py
+++ b/gallery_dl/extractor/xvideos.py
@@ -18,12 +18,6 @@ class XvideosExtractor(Extractor):
     category = "xvideos"
     root = "https://www.xvideos.com"

-    def get_page(self, url, codes=(403, 404)):
-        response = self.request(url, expect=codes)
-        if response.status_code in codes:
-            raise exception.NotFoundError(self.subcategory)
-        return response.text
-

 class XvideosGalleryExtractor(XvideosExtractor):
     """Extractor for user profile galleries from xvideos.com"""
@@ -37,7 +31,7 @@ class XvideosGalleryExtractor(XvideosExtractor):
         (("https://www.xvideos.com/profiles"
           "/pervertedcouple/photos/751031/random_stuff"), {
             "url": "4f0d992e5dc39def2c3ac8e099d17bf09e76e3c7",
-            "keyword": "8d637b372c6231cc4ada92dd5918db5fdbd06520",
+            "keyword": "65979d63a69576cf692b41d5fbbd995cc40a51b9",
         }),
         ("https://www.xvideos.com/profiles/pervertedcouple/photos/751032/", {
             "exception": exception.NotFoundError,
@@ -50,7 +44,7 @@ class XvideosGalleryExtractor(XvideosExtractor):

     def items(self):
         url = "{}/profiles/{}/photos/{}".format(self.root, self.user, self.gid)
-        page = self.get_page(url)
+        page = self.request(url, notfound=self.subcategory).text
         data = self.get_metadata(page)
         imgs = self.get_images(page)
         data["count"] = len(imgs)
@@ -113,7 +107,7 @@ class XvideosUserExtractor(XvideosExtractor):

     def items(self):
         url = "{}/profiles/{}".format(self.root, self.user)
-        page = self.get_page(url)
+        page = self.request(url, notfound=self.subcategory).text
         data = json.loads(text.extract(
             page, "xv.conf=", ";</script>")[0])["data"]
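
Note on the reworked Extractor.request() in common.py: the old expect=<codes> parameter is gone. Callers now pass fatal=False to receive most 4xx responses instead of raising, or notfound="<subject>" to turn a 404 into exception.NotFoundError. Retry counting starts at 1, HTTP 429/430 and 5xx responses are retried with exponential backoff capped at 30 minutes, and a negative "retries" config value now means "retry forever" (which is what the simplified Shopify request() override relies on). A minimal sketch of that retry loop, using a hypothetical fetch() callable in place of requests.Session.request():

    import time

    def request_with_backoff(fetch, url, retries=4):
        # Map a negative retry count to "retry forever", mirroring the
        # 'if self._retries < 0' check added to Extractor.__init__().
        if retries < 0:
            retries = float("inf")
        tries = 1
        while True:
            response = fetch(url)
            code = response.status_code
            if 200 <= code < 400:
                return response
            if code < 500 and code not in (429, 430):
                break  # permanent client error; retrying will not help
            if tries > retries:
                break
            # exponential backoff: 1s, 2s, 4s, ... capped at 30 minutes
            time.sleep(min(2 ** (tries - 1), 1800))
            tries += 1
        raise RuntimeError("{}: {} for url: {}".format(
            code, response.reason, url))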
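The 'date-min'/'date-max'/'date-format' parsing that used to live in RedditAPI._parse_datetime() moved into the shared Extractor._get_date_min_max(), which is what lets the Tumblr extractor honor the same options (#337). A standalone sketch of that helper, assuming a plain dict of option values instead of gallery-dl's layered config lookup:

    import datetime

    def get_date_min_max(config, dmin=None, dmax=None):
        # Sketch of Extractor._get_date_min_max(): strings are parsed
        # with the configured format, everything else passes through.
        fmt = config.get("date-format", "%Y-%m-%dT%H:%M:%S")

        def get(key, default):
            ts = config.get(key, default)
            if isinstance(ts, str):
                try:
                    ts = int(datetime.datetime.strptime(ts, fmt).timestamp())
                except ValueError:
                    ts = default  # fall back to the default on bad input
            return ts

        return get("date-min", dmin), get("date-max", dmax)

    # With the options from the new Tumblr test case,
    #   get_date_min_max({"date-min": "201804", "date-max": "201805",
    #                     "date-format": "%Y%m"}, 0, None)
    # returns the Unix timestamps for 2018-04-01 and 2018-05-01.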
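The urllib3 cipher override in common.py (originally added for Cloudflare and, by extension, ArtStation, #227) is now configurable through a top-level "ciphers" option: true (the default) installs the Firefox-style list shown in the diff, a list of cipher names is joined with ":", and a false value leaves urllib3's defaults untouched. A sketch of that selection logic in isolation:

    def resolve_ciphers(option, firefox_list):
        # Mirrors the logic around config.get(("ciphers",), True):
        if not option:
            return None            # no override; keep urllib3's defaults
        if option is True:
            return firefox_list    # the built-in list from common.py
        if isinstance(option, list):
            return ":".join(option)
        return option              # assume an already-joined cipher string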
