summary refs log tree commit diff stats
path: root/gallery_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'gallery_dl/extractor')
-rw-r--r-- gallery_dl/extractor/35photo.py 17
-rw-r--r-- gallery_dl/extractor/500px.py 38
-rw-r--r-- gallery_dl/extractor/__init__.py 1
-rw-r--r-- gallery_dl/extractor/artstation.py 4
-rw-r--r-- gallery_dl/extractor/common.py 117
-rw-r--r-- gallery_dl/extractor/deviantart.py 17
-rw-r--r-- gallery_dl/extractor/erolord.py 64
-rw-r--r-- gallery_dl/extractor/exhentai.py 4
-rw-r--r-- gallery_dl/extractor/imgur.py 15
-rw-r--r-- gallery_dl/extractor/instagram.py 46
-rw-r--r-- gallery_dl/extractor/newgrounds.py 20
-rw-r--r-- gallery_dl/extractor/nijie.py 19
-rw-r--r-- gallery_dl/extractor/pinterest.py 4
-rw-r--r-- gallery_dl/extractor/pixiv.py 11
-rw-r--r-- gallery_dl/extractor/reddit.py 19
-rw-r--r-- gallery_dl/extractor/sankakucomplex.py 4
-rw-r--r-- gallery_dl/extractor/seiga.py 4
-rw-r--r-- gallery_dl/extractor/sexcom.py 42
-rw-r--r-- gallery_dl/extractor/shopify.py 28
-rw-r--r-- gallery_dl/extractor/simplyhentai.py 56
-rw-r--r-- gallery_dl/extractor/smugmug.py 22
-rw-r--r-- gallery_dl/extractor/tsumino.py 4
-rw-r--r-- gallery_dl/extractor/tumblr.py 18
-rw-r--r-- gallery_dl/extractor/twitter.py 27
-rw-r--r-- gallery_dl/extractor/xvideos.py 12
25 files changed, 380 insertions, 233 deletions
diff --git a/gallery_dl/extractor/35photo.py b/gallery_dl/extractor/35photo.py
index 50dbfe8..d3e9276 100644
--- a/gallery_dl/extractor/35photo.py
+++ b/gallery_dl/extractor/35photo.py
@@ -72,7 +72,6 @@ class _35photoExtractor(Extractor):
"user" : data["user_login"],
"user_id" : data["user_id"],
"user_name" : data["user_name"],
- "other" : data["otherData"],
}
if "series" in data:
@@ -89,6 +88,8 @@ class _35photoExtractor(Extractor):
def _photo_ids(page):
"""Extract unique photo IDs and return them as sorted list"""
# searching for photo-id="..." doesn't always work (see unit tests)
+ if not page:
+ return ()
return sorted(
set(text.extract_iter(page, "/photo_", "/")),
key=text.parse_int,
@@ -100,7 +101,7 @@ class _35photoUserExtractor(_35photoExtractor):
"""Extractor for all images of a user on 35photo.pro"""
subcategory = "user"
pattern = (r"(?:https?://)?(?:[a-z]+\.)?35photo\.pro"
- r"/(?!photo_|genre_)([^/?&#]+)")
+ r"/(?!photo_|genre_|rating/)([^/?&#]+)")
test = (
("https://35photo.pro/liya", {
"pattern": r"https://m\d+.35photo.pro/photos_(main|series)/.*.jpg",
@@ -146,7 +147,14 @@ class _35photoGenreExtractor(_35photoExtractor):
("https://35photo.pro/genre_109/", {
"range": "1-30",
}),
- ("https://35photo.pro/genre_109/new/"),
+ ("https://35photo.pro/genre_103/", {
+ "range": "1-30",
+ "count": 30,
+ }),
+ ("https://35photo.pro/genre_103/new/", {
+ "range": "1-30",
+ "count": 30,
+ }),
)
def __init__(self, match):
@@ -165,6 +173,8 @@ class _35photoGenreExtractor(_35photoExtractor):
}
def photos(self):
+ if not self.photo_ids:
+ return ()
return self._pagination({
"page": "genre",
"community_id": self.genre_id,
@@ -193,7 +203,6 @@ class _35photoImageExtractor(_35photoExtractor):
"user" : "liya",
"user_id" : 20415,
"user_name" : "Liya Mirzaeva",
- "other" : str,
},
})
diff --git a/gallery_dl/extractor/500px.py b/gallery_dl/extractor/500px.py
index 00b8ab5..07c2e14 100644
--- a/gallery_dl/extractor/500px.py
+++ b/gallery_dl/extractor/500px.py
@@ -31,8 +31,7 @@ class _500pxExtractor(Extractor):
for photo in self.photos():
url = photo["images"][-1]["url"]
- fmt = photo["image_format"]
- photo["extension"] = "jpg" if fmt == "jpeg" else fmt
+ photo["extension"] = photo["image_format"]
if data:
photo.update(data)
if first:
@@ -59,7 +58,7 @@ class _500pxExtractor(Extractor):
"include_releases" : "true",
"liked_by" : "1",
"following_sample" : "100",
- "image_size" : "32768",
+ "image_size" : "4096",
"ids" : ",".join(str(p["id"]) for p in photos),
}
@@ -90,7 +89,7 @@ class _500pxUserExtractor(_500pxExtractor):
pattern = (r"(?:https?://)?500px\.com"
r"/(?!photo/)([^/?&#]+)/?(?:$|\?|#)")
test = ("https://500px.com/light_expression_photography", {
- "pattern": r"https?://drscdn.500px.org/photo/\d+/m%3D5000/v2",
+ "pattern": r"https?://drscdn.500px.org/photo/\d+/m%3D4096/v2",
"range": "1-99",
"count": 99,
})
@@ -124,7 +123,7 @@ class _500pxGalleryExtractor(_500pxExtractor):
pattern = (r"(?:https?://)?500px\.com"
r"/(?!photo/)([^/?&#]+)/galleries/([^/?&#]+)")
test = ("https://500px.com/fashvamp/galleries/lera", {
- "url": "8a520272ece83278166b4f8556f9c9da43c43c45",
+ "url": "002dc81dee5b4a655f0e31ad8349e8903b296df6",
"count": 3,
"keyword": {
"gallery": dict,
@@ -144,7 +143,7 @@ class _500pxGalleryExtractor(_500pxExtractor):
page = self.request(url).text
self.csrf_token, pos = text.extract(page, 'csrf-token" content="', '"')
self.user_id , pos = text.extract(page, 'App.CuratorId =', '\n', pos)
- self.user_id = self.user_id.strip()
+ self.user_id = self.user_id.strip(" '\";")
# get gallery metadata; transform gallery name into id
url = "https://api.500px.com/v1/users/{}/galleries/{}".format(
@@ -174,37 +173,30 @@ class _500pxImageExtractor(_500pxExtractor):
subcategory = "image"
pattern = r"(?:https?://)?500px\.com/photo/(\d+)"
test = ("https://500px.com/photo/222049255/queen-of-coasts", {
- "url": "d1eda7afeaa589f71f05b9bb5c0694e3ffb357cd",
+ "url": "fbdf7df39325cae02f5688e9f92935b0e7113315",
"count": 1,
"keyword": {
"camera": "Canon EOS 600D",
"camera_info": dict,
- "collections_count": int,
"comments": list,
"comments_count": int,
- "converted": False,
- "converted_bits": int,
- "created_at": "2017-08-01T04:40:05-04:00",
- "crop_version": 0,
+ "created_at": "2017-08-01T08:40:05+00:00",
"description": str,
- "editored_by": dict,
+ "editored_by": None,
"editors_choice": False,
"extension": "jpg",
- "favorites_count": int,
"feature": "popular",
"feature_date": "2017-08-01T09:58:28+00:00",
"focal_length": "208",
"height": 3111,
"id": 222049255,
- "image_format": "jpeg",
- "image_url": str,
+ "image_format": "jpg",
+ "image_url": list,
"images": list,
"iso": "100",
"lens": "EF-S55-250mm f/4-5.6 IS II",
"lens_info": dict,
- "license_type": 0,
- "licensed_at": None,
- "liked": False,
+ "liked": None,
"location": None,
"location_details": dict,
"name": "Queen Of Coasts",
@@ -212,15 +204,11 @@ class _500pxImageExtractor(_500pxExtractor):
"privacy": False,
"profile": True,
"rating": float,
- "sales_count": int,
"status": 1,
- "store_download": False,
- "store_height": 3111,
- "store_width": 4637,
"tags": list,
- "taken_at": "2017-05-04T13:36:51-04:00",
+ "taken_at": "2017-05-04T17:36:51+00:00",
"times_viewed": int,
- "url": "/photo/222049255/queen-of-coasts-by-olesya-nabieva",
+ "url": "/photo/222049255/Queen-Of-Coasts-by-Olesya-Nabieva",
"user": dict,
"user_id": 12847235,
"votes_count": int,
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 81d480e..189c163 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -24,6 +24,7 @@ modules = [
"deviantart",
"dynastyscans",
"e621",
+ "erolord",
"exhentai",
"fallenangels",
"flickr",
diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py
index 24197ad..f7b3bc1 100644
--- a/gallery_dl/extractor/artstation.py
+++ b/gallery_dl/extractor/artstation.py
@@ -79,9 +79,7 @@ class ArtstationExtractor(Extractor):
def get_user_info(self, username):
"""Return metadata for a specific user"""
url = "{}/users/{}/quick.json".format(self.root, username.lower())
- response = self.request(url, expect=(404,))
- if response.status_code == 404:
- raise exception.NotFoundError("user")
+ response = self.request(url, notfound="user")
return response.json()
def _pagination(self, url, params=None):
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 175af63..5c40e2a 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -13,6 +13,7 @@ import time
import netrc
import queue
import logging
+import datetime
import requests
import threading
import http.cookiejar
@@ -39,10 +40,13 @@ class Extractor():
self._init_headers()
self._init_cookies()
self._init_proxies()
- self._retries = self.config("retries", 5)
+ self._retries = self.config("retries", 4)
self._timeout = self.config("timeout", 30)
self._verify = self.config("verify", True)
+ if self._retries < 0:
+ self._retries = float("inf")
+
@classmethod
def from_url(cls, url):
if isinstance(cls.pattern, str):
@@ -63,11 +67,11 @@ class Extractor():
return config.interpolate(
("extractor", self.category, self.subcategory, key), default)
- def request(self, url, method="GET", *, session=None,
- encoding=None, expect=(), retries=None, **kwargs):
- tries = 0
- retries = retries or self._retries
- session = session or self.session
+ def request(self, url, method="GET", *, session=None, retries=None,
+ encoding=None, fatal=True, notfound=None, **kwargs):
+ tries = 1
+ retries = self._retries if retries is None else retries
+ session = self.session if session is None else session
kwargs.setdefault("timeout", self._timeout)
kwargs.setdefault("verify", self._verify)
@@ -83,26 +87,37 @@ class Extractor():
raise exception.HttpError(exc)
else:
code = response.status_code
- if 200 <= code < 400 or code in expect:
+ if 200 <= code < 400 or not fatal and \
+ (400 <= code < 429 or 431 <= code < 500):
if encoding:
response.encoding = encoding
return response
+ if notfound and code == 404:
+ raise exception.NotFoundError(notfound)
if cloudflare.is_challenge(response):
self.log.info("Solving Cloudflare challenge")
url, domain, cookies = cloudflare.solve_challenge(
session, response, kwargs)
cloudflare.cookies.update(self.category, (domain, cookies))
continue
+ if cloudflare.is_captcha(response):
+ try:
+ import OpenSSL # noqa
+ except ImportError:
+ msg = " - Install 'pyOpenSSL' and try again"
+ else:
+ msg = ""
+ self.log.warning("Cloudflare CAPTCHA" + msg)
msg = "{}: {} for url: {}".format(code, response.reason, url)
- if code < 500 and code != 429:
+ if code < 500 and code != 429 and code != 430:
break
- tries += 1
- self.log.debug("%s (%d/%d)", msg, tries, retries)
- if tries >= retries:
+ self.log.debug("%s (%s/%s)", msg, tries, retries+1)
+ if tries > retries:
break
- time.sleep(2 ** tries)
+ time.sleep(min(2 ** (tries-1), 1800))
+ tries += 1
raise exception.HttpError(msg)
@@ -130,8 +145,8 @@ class Extractor():
headers.clear()
headers["User-Agent"] = self.config(
- "user-agent", ("Mozilla/5.0 (X11; Linux x86_64; rv:62.0) "
- "Gecko/20100101 Firefox/62.0"))
+ "user-agent", ("Mozilla/5.0 (X11; Linux x86_64; rv:68.0) "
+ "Gecko/20100101 Firefox/68.0"))
headers["Accept"] = "*/*"
headers["Accept-Language"] = "en-US,en;q=0.5"
headers["Accept-Encoding"] = "gzip, deflate"
@@ -203,6 +218,20 @@ class Extractor():
return False
return True
+ def _get_date_min_max(self, dmin=None, dmax=None):
+ """Retrieve and parse 'date-min' and 'date-max' config values"""
+ def get(key, default):
+ ts = self.config(key, default)
+ if isinstance(ts, str):
+ try:
+ ts = int(datetime.datetime.strptime(ts, fmt).timestamp())
+ except ValueError as exc:
+ self.log.warning("Unable to parse '%s': %s", key, exc)
+ ts = default
+ return ts
+ fmt = self.config("date-format", "%Y-%m-%dT%H:%M:%S")
+ return get("date-min", dmin), get("date-max", dmax)
+
@classmethod
def _get_tests(cls):
"""Yield an extractor's test cases as (URL, RESULTS) tuples"""
@@ -403,30 +432,36 @@ def generate_extractors(extractor_data, symtable, classes):
http.cookiejar.MozillaCookieJar.magic_re = re.compile(
"#( Netscape)? HTTP Cookie File", re.IGNORECASE)
-# Update default cipher list of urllib3
-# to fix issues with Cloudflare and, by extension, Artstation (#227)
-from requests.packages.urllib3.util import ssl_ # noqa
-logging.getLogger("gallery-dl").debug("updating default urllib3 ciphers")
-
-# cipher list taken from urllib3 1.25
-# https://github.com/urllib3/urllib3/blob/1.25/src/urllib3/util/ssl_.py
-# with additions from
-# https://github.com/Anorov/cloudflare-scrape/pull/242
-ssl_.DEFAULT_CIPHERS = (
- "ECDHE+AESGCM:"
- "ECDHE+CHACHA20:"
- "DHE+AESGCM:"
- "DHE+CHACHA20:"
- "ECDH+AESGCM:"
- "DH+AESGCM:"
- "ECDH+AES:"
- "DH+AES:"
- "RSA+AESGCM:"
- "RSA+AES:"
- "!ECDHE+SHA:"
- "!AES128-SHA:"
- "!aNULL:"
- "!eNULL:"
- "!MD5:"
- "!DSS"
-)
+# Replace default cipher list of urllib3 to avoid Cloudflare CAPTCHAs
+ciphers = config.get(("ciphers",), True)
+if ciphers:
+ logging.getLogger("gallery-dl").debug("Updating urllib3 ciphers")
+
+ if ciphers is True:
+ ciphers = (
+ # Firefox's list
+ "TLS_AES_128_GCM_SHA256:"
+ "TLS_CHACHA20_POLY1305_SHA256:"
+ "TLS_AES_256_GCM_SHA384:"
+ "ECDHE-ECDSA-AES128-GCM-SHA256:"
+ "ECDHE-RSA-AES128-GCM-SHA256:"
+ "ECDHE-ECDSA-CHACHA20-POLY1305:"
+ "ECDHE-RSA-CHACHA20-POLY1305:"
+ "ECDHE-ECDSA-AES256-GCM-SHA384:"
+ "ECDHE-RSA-AES256-GCM-SHA384:"
+ "ECDHE-ECDSA-AES256-SHA:"
+ "ECDHE-ECDSA-AES128-SHA:"
+ "ECDHE-RSA-AES128-SHA:"
+ "ECDHE-RSA-AES256-SHA:"
+ "DHE-RSA-AES128-SHA:"
+ "DHE-RSA-AES256-SHA:"
+ "AES128-SHA:"
+ "AES256-SHA:"
+ "DES-CBC3-SHA"
+ )
+ elif isinstance(ciphers, list):
+ ciphers = ":".join(ciphers)
+
+ from requests.packages.urllib3.util import ssl_ # noqa
+ ssl_.DEFAULT_CIPHERS = ciphers
+ del ssl_
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index ebab040..63e2913 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -205,8 +205,7 @@ class DeviantartExtractor(Extractor):
@staticmethod
def _find_folder(folders, name):
- pattern = re.compile(
- r"[^\w]*" + name.replace("-", r"[^\w]+") + r"[^\w]*$")
+ pattern = re.compile(r"(?i)\W*" + name.replace("-", r"\W+") + r"\W*$")
for folder in folders:
if pattern.match(folder["name"]):
return folder
@@ -416,7 +415,7 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
def deviations(self):
url = "{}/{}/{}".format(self.root, self.user, self.path)
- response = self._html_request(url, expect=range(400, 500))
+ response = self._html_request(url, fatal=False)
deviation_id = text.extract(response.text, '//deviation/', '"')[0]
if response.status_code >= 400 or not deviation_id:
raise exception.NotFoundError("image")
@@ -767,7 +766,7 @@ class DeviantartAPI():
def user_profile(self, username):
"""Get user profile information"""
endpoint = "user/profile/" + username
- return self._call(endpoint, expect_error=True)
+ return self._call(endpoint, fatal=False)
def authenticate(self, refresh_token):
"""Authenticate the application by requesting an access token"""
@@ -797,7 +796,7 @@ class DeviantartAPI():
_refresh_token_cache.update(refresh_token, data["refresh_token"])
return "Bearer " + data["access_token"]
- def _call(self, endpoint, params=None, expect_error=False, public=True):
+ def _call(self, endpoint, params=None, fatal=True, public=True):
"""Call an API endpoint"""
url = "https://www.deviantart.com/api/v1/oauth2/" + endpoint
while True:
@@ -806,11 +805,7 @@ class DeviantartAPI():
self.authenticate(None if public else self.refresh_token)
response = self.extractor.request(
- url,
- params=params,
- headers=self.headers,
- expect=range(400, 500),
- )
+ url, headers=self.headers, params=params, fatal=False)
data = response.json()
status = response.status_code
@@ -818,7 +813,7 @@ class DeviantartAPI():
if self.delay > self.delay_min:
self.delay -= 1
return data
- if expect_error:
+ if not fatal:
return None
if data.get("error_description") == "User not found.":
raise exception.NotFoundError("user or group")
diff --git a/gallery_dl/extractor/erolord.py b/gallery_dl/extractor/erolord.py
new file mode 100644
index 0000000..8628039
--- /dev/null
+++ b/gallery_dl/extractor/erolord.py
@@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from http://erolord.com/"""
+
+from .common import GalleryExtractor
+from .. import text, util
+import json
+
+
+class ErolordGalleryExtractor(GalleryExtractor):
+ """Extractor for image galleries from erolord.com"""
+ category = "erolord"
+ root = "http://erolord.com"
+ pattern = r"(?:https?://)?(?:www\.)?erolord.com(/doujin/(\d+)/?)"
+ test = ("http://erolord.com/doujin/2189055/", {
+ "url": "7ce6d10a3934102b95c9718a34ccd3d35f55d85f",
+ "keyword": {
+ "title" : "Amazon No Hiyaku | Amazon Elixir",
+ "gallery_id": 2189055,
+ "count" : 16,
+ "artist" : ["Morris"],
+ "group" : list,
+ "parody" : list,
+ "characters": list,
+ "tags" : list,
+ "lang" : "en",
+ "language" : "English",
+ },
+ })
+
+ def __init__(self, match):
+ GalleryExtractor.__init__(self, match)
+ self.gallery_id = match.group(2)
+
+ def metadata(self, page):
+ extr = text.extract_from(page)
+ split = text.split_html
+ title, _, language = extr('<h1 class="t64">', '</h1>').rpartition(" ")
+ language = language.strip("[]")
+
+ return {
+ "gallery_id": text.parse_int(self.gallery_id),
+ "title" : text.unescape(title),
+ # double quotes for anime, circle, tags
+ # single quotes for characters, artist
+ "parody" : split(extr('class="sp1">Anime:' , "</div>\r")),
+ "characters": split(extr("class='sp1'>Characters:", "</div>\r")),
+ "artist" : split(extr("class='sp1'>Artist:" , "</div>\r")),
+ "group" : split(extr('class="sp1">Circle:' , "</div>\r")),
+ "tags" : split(extr('class="sp1">Tags:' , "</div>\r")),
+ "lang" : util.language_to_code(language),
+ "language" : language,
+ }
+
+ def images(self, page):
+ url = self.root + text.extract(page, 'id="d1"><a href="', '"')[0]
+ imgs = text.extract(self.request(url).text, 'var imgs=', ';')[0]
+ return [(self.root + path, None) for path in json.loads(imgs)]
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index d67c58a..20e0746 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -259,7 +259,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
def _gallery_page(self):
url = "{}/g/{}/{}/".format(
self.root, self.gallery_id, self.gallery_token)
- response = self.request(url, expect=range(400, 500))
+ response = self.request(url, fatal=False)
page = response.text
if response.status_code == 404 and "Gallery Not Available" in page:
@@ -271,7 +271,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
def _image_page(self):
url = "{}/s/{}/{}-{}".format(
self.root, self.image_token, self.gallery_id, self.image_num)
- page = self.request(url, expect=range(400, 500)).text
+ page = self.request(url, fatal=False).text
if page.startswith(("Invalid page", "Keep trying")):
raise exception.NotFoundError("image page")
diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py
index 0468c0b..c5e3d17 100644
--- a/gallery_dl/extractor/imgur.py
+++ b/gallery_dl/extractor/imgur.py
@@ -16,16 +16,15 @@ import json
class ImgurExtractor(Extractor):
"""Base class for imgur extractors"""
category = "imgur"
+ root = "https://imgur.com"
def __init__(self, match):
Extractor.__init__(self, match)
self.item_id = match.group(1)
self.mp4 = self.config("mp4", True)
- def _get_data(self, urlpart):
- response = self.request("https://imgur.com/" + urlpart, expect=(404,))
- if response.status_code == 404:
- raise exception.NotFoundError(self.subcategory)
+ def _get_data(self, path):
+ response = self.request(self.root + path, notfound=self.subcategory)
data = text.extract(response.text, "image : ", ",\n")[0]
return self._clean(json.loads(data))
@@ -102,7 +101,7 @@ class ImgurImageExtractor(ImgurExtractor):
)
def items(self):
- image = self._get_data(self.item_id)
+ image = self._get_data("/" + self.item_id)
url = self._prepare(image)
yield Message.Version, 1
@@ -165,13 +164,13 @@ class ImgurAlbumExtractor(ImgurExtractor):
)
def items(self):
- album = self._get_data("a/" + self.item_id + "/all")
+ album = self._get_data("/a/" + self.item_id + "/all")
images = album["album_images"]["images"]
del album["album_images"]
if int(album["num_images"]) > len(images):
- url = ("https://imgur.com/ajaxalbums/getimages/" +
- self.item_id + "/hit.json")
+ url = "{}/ajaxalbums/getimages/{}/hit.json".format(
+ self.root, self.item_id)
images = self.request(url).json()["data"]["images"]
yield Message.Version, 1
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 871236b..475e24b 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -11,7 +11,8 @@
import hashlib
import json
from .common import Extractor, Message
-from .. import text
+from .. import text, exception
+from ..cache import cache
class InstagramExtractor(Extractor):
@@ -21,11 +22,14 @@ class InstagramExtractor(Extractor):
filename_fmt = "{sidecar_media_id:?/_/}{media_id}.{extension}"
archive_fmt = "{media_id}"
root = "https://www.instagram.com"
+ cookiedomain = ".instagram.com"
+ cookienames = ("sessionid",)
def get_metadata(self):
return {}
def items(self):
+ self.login()
yield Message.Version, 1
metadata = self.get_metadata()
@@ -40,6 +44,46 @@ class InstagramExtractor(Extractor):
yield Message.Url, \
'ytdl:{}/p/{}/'.format(self.root, data['shortcode']), data
+ def login(self):
+ if self._check_cookies(self.cookienames):
+ return
+ username, password = self._get_auth_info()
+ if username:
+ self.session.cookies.set("ig_cb", "1", domain="www.instagram.com")
+ self._update_cookies(self._login_impl(username, password))
+
+ @cache(maxage=360*24*3600, keyarg=1)
+ def _login_impl(self, username, password):
+ self.log.info("Logging in as %s", username)
+
+ page = self.request(self.root + "/accounts/login/").text
+ headers = {
+ "Referer" : self.root + "/accounts/login/",
+ "X-IG-App-ID" : "936619743392459",
+ "X-Requested-With": "XMLHttpRequest",
+ }
+
+ response = self.request(self.root + "/web/__mid/", headers=headers)
+ headers["X-CSRFToken"] = response.cookies["csrftoken"]
+ headers["X-Instagram-AJAX"] = text.extract(
+ page, '"rollout_hash":"', '"')[0]
+
+ url = self.root + "/accounts/login/ajax/"
+ data = {
+ "username" : username,
+ "password" : password,
+ "queryParams" : "{}",
+ "optIntoOneTap": "true",
+ }
+ response = self.request(url, method="POST", headers=headers, data=data)
+
+ if not response.json().get("authenticated"):
+ raise exception.AuthenticationError()
+ return {
+ key: self.session.cookies.get(key)
+ for key in ("sessionid", "mid", "csrftoken")
+ }
+
def _extract_shared_data(self, page):
return json.loads(text.extract(page,
'window._sharedData = ', ';</script>')[0])
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index 9e0aaa3..282c389 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -48,22 +48,20 @@ class NewgroundsExtractor(Extractor):
extr = text.extract_from(self.request(page_url).text)
full = text.extract_from(json.loads(extr('"full_image_text":', '});')))
data = {
+ "title" : text.unescape(extr('"og:title" content="', '"')),
"description": text.unescape(extr(':description" content="', '"')),
- "date" : extr('itemprop="datePublished" content="', '"'),
+ "date" : text.parse_datetime(extr(
+ 'itemprop="datePublished" content="', '"')),
"rating" : extr('class="rated-', '"'),
"favorites" : text.parse_int(extr('id="faves_load">', '<')),
"score" : text.parse_float(extr('id="score_number">', '<')),
+ "tags" : text.split_html(extr(
+ '<dd class="tags momag">', '</dd>')),
"url" : full('src="', '"'),
- "title" : text.unescape(full('alt="', '"')),
"width" : text.parse_int(full('width="', '"')),
"height" : text.parse_int(full('height="', '"')),
}
-
- tags = text.split_html(extr('<dd class="tags momag">', '</dd>'))
- tags.sort()
- data["tags"] = tags
-
- data["date"] = text.parse_datetime(data["date"])
+ data["tags"].sort()
data["index"] = text.parse_int(
data["url"].rpartition("/")[2].partition("_")[0])
return data
@@ -95,7 +93,7 @@ class NewgroundsUserExtractor(NewgroundsExtractor):
test = (
("https://blitzwuff.newgrounds.com/art", {
"url": "24b19c4a135a09889fac7b46a74e427e4308d02b",
- "keyword": "2aab0532a894ff3cf88dd01ce5c60f114011b268",
+ "keyword": "98566e0c8096a8099b8d71962fea7e31c8b098d4",
}),
("https://blitzwuff.newgrounds.com/"),
)
@@ -140,9 +138,9 @@ class NewgroundsVideoExtractor(NewgroundsExtractor):
subcategory = "video"
filename_fmt = "{category}_{index}.{extension}"
pattern = r"(?:https?://)?([^.]+)\.newgrounds\.com/movies/?$"
- test = ("https://twistedgrim.newgrounds.com/movies", {
+ test = ("https://tomfulp.newgrounds.com/movies", {
"pattern": r"ytdl:https?://www\.newgrounds\.com/portal/view/\d+",
- "count": ">= 29",
+ "count": ">= 32",
})
def get_page_urls(self):
diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py
index abf1eaa..4c48d73 100644
--- a/gallery_dl/extractor/nijie.py
+++ b/gallery_dl/extractor/nijie.py
@@ -106,13 +106,8 @@ class NijieExtractor(AsynchronousMixin, Extractor):
params = {"id": self.user_id, "p": 1}
while True:
- response = self.request(url, params=params, expect=(404,))
- if response.status_code == 404:
- raise exception.NotFoundError("artist")
-
- page = response.text
- ids = list(text.extract_iter(page, ' illust_id="', '"'))
- yield from ids
+ page = self.request(url, params=params, notfound="artist").text
+ yield from text.extract_iter(page, 'illust_id="', '"')
if '<a rel="next"' not in page:
return
@@ -126,7 +121,7 @@ class NijieUserExtractor(NijieExtractor):
r"/members(?:_illust)?\.php\?id=(\d+)")
test = (
("https://nijie.info/members_illust.php?id=44", {
- "url": "585d821df4716b1098660a0be426d01db4b65f2a",
+ "url": "66c4ff94c6e77c0765dd88f2d8c663055fda573e",
"keyword": "d629c69e3172db1d7e026145e8eb640ac31ac16a",
}),
("https://nijie.info/members_illust.php?id=43", {
@@ -174,7 +169,7 @@ class NijieImageExtractor(NijieExtractor):
r"/view(?:_popup)?\.php\?id=(\d+)")
test = (
("https://nijie.info/view.php?id=70720", {
- "url": "a10d4995645b5f260821e32c60a35f73546c2699",
+ "url": "5497f897311397dafa188521258624346a0af2a3",
"keyword": "408393d010307c76d52cbd0a4368d6d357805aea",
"content": "d85e3ea896ed5e4da0bca2390ad310a4df716ca6",
}),
@@ -190,10 +185,8 @@ class NijieImageExtractor(NijieExtractor):
self.page = ""
def get_job_metadata(self):
- response = self.request(self.view_url + self.image_id, expect=(404,))
- if response.status_code == 404:
- raise exception.NotFoundError("image")
- self.page = response.text
+ self.page = self.request(
+ self.view_url + self.image_id, notfound="image").text
self.user_id = text.extract(
self.page, '"sameAs": "https://nijie.info/members.php?id=', '"')[0]
return NijieExtractor.get_job_metadata(self)
diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py
index fa8cd48..f5b8869 100644
--- a/gallery_dl/extractor/pinterest.py
+++ b/gallery_dl/extractor/pinterest.py
@@ -228,14 +228,14 @@ class PinterestAPI():
params = {"data": json.dumps({"options": options}), "source_url": ""}
response = self.extractor.request(
- url, params=params, headers=self.HEADERS, expect=range(400, 500))
+ url, params=params, headers=self.HEADERS, fatal=False)
try:
data = response.json()
except ValueError:
data = {}
- if 200 <= response.status_code < 400 and not response.history:
+ if response.status_code < 400 and not response.history:
return data
if response.status_code == 404 or response.history:
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index af29c4b..76d4dc4 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -143,9 +143,7 @@ class PixivMeExtractor(PixivExtractor):
def items(self):
url = "https://pixiv.me/" + self.account
response = self.request(
- url, method="HEAD", allow_redirects=False, expect=(404,))
- if response.status_code == 404:
- raise exception.NotFoundError("user")
+ url, method="HEAD", allow_redirects=False, notfound="user")
yield Message.Version, 1
yield Message.Queue, response.headers["Location"], {}
@@ -445,7 +443,7 @@ class PixivAppAPI():
data["password"] = password
response = self.extractor.request(
- url, method="POST", data=data, expect=(400,))
+ url, method="POST", data=data, fatal=False)
if response.status_code >= 400:
raise exception.AuthenticationError()
@@ -491,10 +489,9 @@ class PixivAppAPI():
url = "https://app-api.pixiv.net/" + endpoint
self.login()
- response = self.extractor.request(
- url, params=params, expect=range(400, 500))
+ response = self.extractor.request(url, params=params, fatal=False)
- if 200 <= response.status_code < 400:
+ if response.status_code < 400:
return response.json()
if response.status_code == 404:
raise exception.NotFoundError()
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index 0c5a924..2ba4b99 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -11,7 +11,6 @@
from .common import Extractor, Message
from .. import text, util, extractor, exception
from ..cache import cache
-import datetime
import time
@@ -235,8 +234,7 @@ class RedditAPI():
url = "https://oauth.reddit.com" + endpoint
params["raw_json"] = 1
self.authenticate()
- response = self.extractor.request(
- url, params=params, expect=range(400, 500))
+ response = self.extractor.request(url, params=params, fatal=False)
remaining = response.headers.get("x-ratelimit-remaining")
if remaining and float(remaining) < 2:
wait = int(response.headers["x-ratelimit-reset"])
@@ -252,12 +250,9 @@ class RedditAPI():
return data
def _pagination(self, endpoint, params, _empty=()):
- date_fmt = self.extractor.config("date-format", "%Y-%m-%dT%H:%M:%S")
- date_min = self._parse_datetime("date-min", 0, date_fmt)
- date_max = self._parse_datetime("date-max", 253402210800, date_fmt)
-
id_min = self._parse_id("id-min", 0)
id_max = self._parse_id("id-max", 2147483647)
+ date_min, date_max = self.extractor._get_date_min_max(0, 253402210800)
while True:
data = self._call(endpoint, params)["data"]
@@ -294,16 +289,6 @@ class RedditAPI():
if link_id and extra:
yield from self.morechildren(link_id, extra)
- def _parse_datetime(self, key, default, fmt):
- ts = self.extractor.config(key, default)
- if isinstance(ts, str):
- try:
- ts = int(datetime.datetime.strptime(ts, fmt).timestamp())
- except ValueError as exc:
- self.log.warning("Unable to parse '%s': %s", key, exc)
- ts = default
- return ts
-
def _parse_id(self, key, default):
sid = self.extractor.config(key)
return self._decode(sid.rpartition("_")[2].lower()) if sid else default
diff --git a/gallery_dl/extractor/sankakucomplex.py b/gallery_dl/extractor/sankakucomplex.py
index 22b2b63..55eda9f 100644
--- a/gallery_dl/extractor/sankakucomplex.py
+++ b/gallery_dl/extractor/sankakucomplex.py
@@ -110,8 +110,8 @@ class SankakucomplexTagExtractor(SankakucomplexExtractor):
yield Message.Version, 1
while True:
url = "{}/{}/page/{}/".format(self.root, self.path, pnum)
- response = self.request(url, expect=(404,))
- if response.status_code == 404:
+ response = self.request(url, fatal=False)
+ if response.status_code >= 400:
return
for url in text.extract_iter(response.text, 'data-direct="', '"'):
if url != last:
diff --git a/gallery_dl/extractor/seiga.py b/gallery_dl/extractor/seiga.py
index f63c999..0d92573 100644
--- a/gallery_dl/extractor/seiga.py
+++ b/gallery_dl/extractor/seiga.py
@@ -43,9 +43,7 @@ class SeigaExtractor(Extractor):
"""Get url for an image with id 'image_id'"""
url = "{}/image/source/{}".format(self.root, image_id)
response = self.request(
- url, method="HEAD", allow_redirects=False, expect=(404,))
- if response.status_code == 404:
- raise exception.NotFoundError("image")
+ url, method="HEAD", allow_redirects=False, notfound="image")
return response.headers["Location"].replace("/o/", "/priv/", 1)
def login(self):
diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py
index aa2b16b..afd4eaa 100644
--- a/gallery_dl/extractor/sexcom.py
+++ b/gallery_dl/extractor/sexcom.py
@@ -23,9 +23,9 @@ class SexcomExtractor(Extractor):
def items(self):
yield Message.Version, 1
yield Message.Directory, self.metadata()
- for url in self.pins():
- pin = self._parse_pin(url)
- yield Message.Url, pin["url"], pin
+ for pin in map(self._parse_pin, self.pins()):
+ if pin:
+ yield Message.Url, pin["url"], pin
def metadata(self):
return {}
@@ -49,8 +49,13 @@ class SexcomExtractor(Extractor):
return
url = text.urljoin(self.root, url)
- def _parse_pin(self, pin_url):
- extr = text.extract_from(self.request(pin_url).text)
+ def _parse_pin(self, url):
+ response = self.request(url, fatal=False)
+ if response.status_code >= 400:
+ self.log.warning('Unable to fetch %s ("%s: %s")',
+ url, response.status_code, response.reason)
+ return None
+ extr = text.extract_from(response.text)
data = {}
data["thumbnail"] = extr('itemprop="thumbnail" content="', '"')
@@ -88,10 +93,10 @@ class SexcomExtractor(Extractor):
class SexcomPinExtractor(SexcomExtractor):
- """Extractor a pinned image or video on www.sex.com"""
+ """Extractor for a pinned image or video on www.sex.com"""
subcategory = "pin"
directory_fmt = ("{category}",)
- pattern = r"(?:https?://)?(?:www\.)?sex\.com/pin/(\d+)"
+ pattern = r"(?:https?://)?(?:www\.)?sex\.com/pin/(\d+)(?!.*#related$)"
test = (
# picture
("https://www.sex.com/pin/56714360/", {
@@ -124,6 +129,10 @@ class SexcomPinExtractor(SexcomExtractor):
("https://www.sex.com/pin/55847384-very-nicely-animated/", {
"pattern": "ytdl:https://www.pornhub.com/embed/ph56ef24b6750f2",
}),
+ # 404
+ ("https://www.sex.com/pin/55847385/", {
+ "count": 0,
+ }),
)
def __init__(self, match):
@@ -134,6 +143,25 @@ class SexcomPinExtractor(SexcomExtractor):
return ("{}/pin/{}/".format(self.root, self.pin_id),)
+class SexcomRelatedPinExtractor(SexcomPinExtractor):
+ """Extractor for related pins on www.sex.com"""
+ subcategory = "related-pin"
+ directory_fmt = ("{category}", "related {original_pin[pin_id]}")
+ pattern = r"(?:https?://)?(?:www\.)?sex\.com/pin/(\d+).*#related$"
+ test = ("https://www.sex.com/pin/56714360/#related", {
+ "count": 24,
+ })
+
+ def metadata(self):
+ pin = self._parse_pin(SexcomPinExtractor.pins(self)[0])
+ return {"original_pin": pin}
+
+ def pins(self):
+ url = "{}/pin/related?pinId={}&limit=24&offset=0".format(
+ self.root, self.pin_id)
+ return self._pagination(url)
+
+
class SexcomBoardExtractor(SexcomExtractor):
"""Extractor for pins from a board on www.sex.com"""
subcategory = "board"
diff --git a/gallery_dl/extractor/shopify.py b/gallery_dl/extractor/shopify.py
index 35895bb..b2498a0 100644
--- a/gallery_dl/extractor/shopify.py
+++ b/gallery_dl/extractor/shopify.py
@@ -10,7 +10,6 @@
from .common import Extractor, Message, SharedConfigMixin, generate_extractors
from .. import text
-import time
import re
@@ -24,19 +23,9 @@ class ShopifyExtractor(SharedConfigMixin, Extractor):
Extractor.__init__(self, match)
self.item_url = self.root + match.group(1)
- def request(self, url, method="GET", expect=range(400, 500), **kwargs):
- tries = 0
- kwargs["expect"] = expect
- while True:
- response = Extractor.request(self, url, method, **kwargs)
- if response.status_code not in (429, 430):
- return response
- tries += 1
- waittime = 2 ** (tries + 2)
- self.log.warning(
- "HTTP status %s: %s - Waiting for %d seconds",
- response.status_code, response.reason, waittime)
- time.sleep(waittime)
+ def request(self, url, **kwargs):
+ kwargs["retries"] = float("inf")
+ return Extractor.request(self, url, **kwargs)
def items(self):
data = self.metadata()
@@ -45,9 +34,10 @@ class ShopifyExtractor(SharedConfigMixin, Extractor):
headers = {"X-Requested-With": "XMLHttpRequest"}
for url in self.products():
- response = self.request(url + ".json", headers=headers)
+ response = self.request(
+ url + ".json", headers=headers, fatal=False)
if response.status_code >= 400:
- self.log.warning('Skipping %s ("%d: %s")',
+ self.log.warning('Skipping %s ("%s: %s")',
url, response.status_code, response.reason)
continue
product = response.json()["product"]
@@ -89,10 +79,14 @@ class ShopifyCollectionExtractor(ShopifyExtractor):
while True:
page = self.request(self.item_url, params=params).text
urls = search_re.findall(page)
+ last = None
if not urls:
return
for path in urls:
+ if last == path:
+ continue
+ last = path
yield self.root + path
params["page"] += 1
@@ -113,7 +107,7 @@ EXTRACTORS = {
"pattern": r"(?:www\.)?fashionnova\.com",
"test-product": (
("https://www.fashionnova.com/products/essential-slide-red", {
- "pattern": r"https?://cdn\.shopify.com/",
+ "pattern": r"https?://cdn\d*\.shopify.com/",
"count": 3,
}),
("https://www.fashionnova.com/collections/flats/products/name"),
diff --git a/gallery_dl/extractor/simplyhentai.py b/gallery_dl/extractor/simplyhentai.py
index 44dc6fe..5ad372d 100644
--- a/gallery_dl/extractor/simplyhentai.py
+++ b/gallery_dl/extractor/simplyhentai.py
@@ -23,7 +23,7 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
(("https://original-work.simply-hentai.com"
"/amazon-no-hiyaku-amazon-elixir"), {
"url": "258289249990502c3138719cb89e995a60861e49",
- "keyword": "18ab9defca53dbb2aeb7965193e93e0ea125b76b",
+ "keyword": "eba83ccdbab3022a2280c77aa747f9458196138b",
}),
("https://www.simply-hentai.com/notfound", {
"exception": exception.GalleryDLException,
@@ -40,30 +40,26 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
self.session.headers["Referer"] = url
def metadata(self, page):
- extr = text.extract
- title , pos = extr(page, '<meta property="og:title" content="', '"')
+ extr = text.extract_from(page)
+ split = text.split_html
+
+ title = extr('<meta property="og:title" content="', '"')
if not title:
raise exception.NotFoundError("gallery")
- gid , pos = extr(page, '/Album/', '/', pos)
- series, pos = extr(page, 'box-title">Series</div>', '</div>', pos)
- lang , pos = extr(page, 'box-title">Language</div>', '</div>', pos)
- chars , pos = extr(page, 'box-title">Characters</div>', '</div>', pos)
- tags , pos = extr(page, 'box-title">Tags</div>', '</div>', pos)
- artist, pos = extr(page, 'box-title">Artists</div>', '</div>', pos)
- date , pos = extr(page, 'Uploaded', '</div>', pos)
- lang = text.remove_html(lang) if lang else None
-
- return {
- "gallery_id": text.parse_int(gid),
+ data = {
"title" : text.unescape(title),
- "artist" : text.split_html(artist),
- "parody" : text.split_html(series),
- "characters": text.split_html(chars),
- "tags" : text.split_html(tags),
- "lang" : util.language_to_code(lang),
- "language" : lang,
- "date" : text.remove_html(date),
+ "gallery_id": text.parse_int(extr('/Album/', '/')),
+ "parody" : split(extr('box-title">Series</div>', '</div>')),
+ "language" : text.remove_html(extr(
+ 'box-title">Language</div>', '</div>')) or None,
+ "characters": split(extr('box-title">Characters</div>', '</div>')),
+ "tags" : split(extr('box-title">Tags</div>', '</div>')),
+ "artist" : split(extr('box-title">Artists</div>', '</div>')),
+ "date" : text.parse_datetime(text.remove_html(
+ extr('Uploaded', '</div>')), "%d.%m.%Y"),
}
+ data["lang"] = util.language_to_code(data["language"])
+ return data
def images(self, _):
url = self.chapter_url + "/all-pages"
@@ -102,12 +98,11 @@ class SimplyhentaiImageExtractor(Extractor):
self.type = match.group(2)
def items(self):
- page = self.request(self.page_url).text
- url_search = 'data-src="' if self.type == "image" else '<source src="'
-
- title, pos = text.extract(page, '"og:title" content="', '"')
- descr, pos = text.extract(page, '"og:description" content="', '"', pos)
- url , pos = text.extract(page, url_search, '"', pos)
+ extr = text.extract_from(self.request(self.page_url).text)
+ title = extr('"og:title" content="' , '"')
+ descr = extr('"og:description" content="', '"')
+ url = extr('&quot;image&quot;:&quot;' , '&')
+ url = extr("&quot;content&quot;:&quot;", "&") or url
tags = text.extract(descr, " tagged with ", " online for free ")[0]
if tags:
@@ -140,13 +135,13 @@ class SimplyhentaiVideoExtractor(Extractor):
("https://videos.simply-hentai.com/creamy-pie-episode-02", {
"pattern": r"https://www\.googleapis\.com/drive/v3/files"
r"/0B1ecQ8ZVLm3JcHZzQzBnVy1ZUmc\?alt=media&key=[\w-]+",
- "keyword": "29d63987fed33f0a9f4b3786d1d71b03d793250a",
+ "keyword": "706790708b14773efc1e075ddd3b738a375348a5",
"count": 1,
}),
(("https://videos.simply-hentai.com"
"/1715-tifa-in-hentai-gang-bang-3d-movie"), {
"url": "ad9a36ae06c601b6490e3c401834b4949d947eb0",
- "keyword": "c561341aa3c6999f615abf1971d28fb2a83da2a7",
+ "keyword": "f9dad94fbde9c95859e631ff4f07297a9567b874",
}),
)
@@ -178,8 +173,9 @@ class SimplyhentaiVideoExtractor(Extractor):
"title": text.unescape(title),
"episode": text.parse_int(episode),
"tags": text.split_html(tags)[::2],
- "date": text.remove_html(date),
"type": "video",
+ "date": text.parse_datetime(text.remove_html(
+ date), "%B %d, %Y %H:%M"),
})
yield Message.Version, 1
diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py
index 80348ae..2e6508c 100644
--- a/gallery_dl/extractor/smugmug.py
+++ b/gallery_dl/extractor/smugmug.py
@@ -69,11 +69,11 @@ class SmugmugAlbumExtractor(SmugmugExtractor):
archive_fmt = "a_{Album[AlbumKey]}_{Image[ImageKey]}"
pattern = r"smugmug:album:([^:]+)$"
test = (
- ("smugmug:album:ddvxpg", {
- "url": "0429e9bf50ee600674e448934e3882ca1761ae7b",
+ ("smugmug:album:cr4C7f", {
+ "url": "1436ee98d5797b308ecce5862e4885944f59c03c",
}),
# empty
- ("smugmug:album:SXvjbW", {
+ ("smugmug:album:Fb7hMs", {
"count": 0,
}),
# no "User"
@@ -109,10 +109,10 @@ class SmugmugImageExtractor(SmugmugExtractor):
archive_fmt = "{Image[ImageKey]}"
pattern = BASE_PATTERN + r"(?:/[^/?&#]+)+/i-([^/?&#-]+)"
test = (
- ("https://acapella.smugmug.com/Micro-Macro/Drops/i-g2Dmf9z", {
- "url": "78f0bf3516b6d670b7319216bdeccb35942ca4cf",
- "keyword": "b298ef7ed2b1918263b6a7dc6f56e54401584381",
- "content": "64a8f69a1d824921eebbdf2420087937adfa45cd",
+ ("https://tdm.smugmug.com/Nature/Dove/i-kCsLJT6", {
+ "url": "f624ad7293afd6412a7d34e3950a118596c36c85",
+ "keyword": "ea70e93be5067dca988d871dcf9afac491a189a4",
+ "content": "ecbd9d7b4f75a637abc8d35319be9ec065a44eb0",
}),
# video
("https://tstravels.smugmug.com/Dailies/Daily-Dose-2015/i-39JFNzB", {
@@ -142,12 +142,12 @@ class SmugmugPathExtractor(SmugmugExtractor):
subcategory = "path"
pattern = BASE_PATTERN + r"((?:/[^/?&#a-fh-mo-z][^/?&#]*)*)/?$"
test = (
- ("https://acapella.smugmug.com/Micro-Macro/Drops/", {
- "pattern": "smugmug:album:ddvxpg$",
+ ("https://tdm.smugmug.com/Nature/Dove", {
+ "pattern": "smugmug:album:cr4C7f$",
}),
- ("https://acapella.smugmug.com/", {
+ ("https://tdm.smugmug.com/", {
"pattern": SmugmugAlbumExtractor.pattern,
- "url": "797eb1cbbf5ad8ecac8ee4eedc6466ed77a65d68",
+ "url": "1640028712875b90974e5aecd91b60e6de6138c7",
}),
# gallery node without owner
("https://www.smugmug.com/gallery/n-GLCjnD/", {
diff --git a/gallery_dl/extractor/tsumino.py b/gallery_dl/extractor/tsumino.py
index 62a9173..03ee144 100644
--- a/gallery_dl/extractor/tsumino.py
+++ b/gallery_dl/extractor/tsumino.py
@@ -107,9 +107,9 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
def images(self, page):
url = "{}/Read/Load/?q={}".format(self.root, self.gallery_id)
headers = {"Referer": self.chapter_url}
- response = self.request(url, headers=headers, expect=(404,))
+ response = self.request(url, headers=headers, fatal=False)
- if response.status_code == 404:
+ if response.status_code >= 400:
url = "{}/Read/View/{}".format(self.root, self.gallery_id)
self.log.error(
"Failed to get gallery JSON data. Visit '%s' in a browser "
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index 5679cdc..024d6e9 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -65,11 +65,15 @@ class TumblrExtractor(Extractor):
if self.reblogs == "same-blog":
self._skip_reblog = self._skip_reblog_same_blog
+ self.date_min, self.api.before = self._get_date_min_max(0, None)
+
def items(self):
blog = None
yield Message.Version, 1
for post in self.posts():
+ if self.date_min > post["timestamp"]:
+ return
if post["type"] not in self.types:
continue
if not blog:
@@ -207,7 +211,7 @@ class TumblrUserExtractor(TumblrExtractor):
("http://demo.tumblr.com/", {
"pattern": (r"https?://(?:$|"
r"\d+\.media\.tumblr\.com/.+_1280\.jpg|"
- r"w+\.tumblr\.com/audio_file/demo/\d+/tumblr_\w+)"),
+ r"a\.tumblr\.com/tumblr_\w+)"),
"count": 3,
"options": (("posts", "all"), ("external", True))
}),
@@ -223,6 +227,11 @@ class TumblrUserExtractor(TumblrExtractor):
"count": 2,
"keyword": {"tags": ["test", "private", "hidden"]},
}),
+ ("https://mikf123.tumblr.com/", { # date-min/-max/-format (#337)
+ "count": 4,
+ "options": (("date-min", "201804"), ("date-max", "201805"),
+ ("date-format", "%Y%m"))
+ }),
("https://demo.tumblr.com/page/2"),
("https://demo.tumblr.com/archive"),
("tumblr:http://www.b-authentique.com/"),
@@ -280,6 +289,7 @@ class TumblrPostExtractor(TumblrExtractor):
TumblrExtractor.__init__(self, match)
self.post_id = match.group(3)
self.reblogs = True
+ self.date_min = 0
def posts(self):
return self.api.posts(self.blog, {"id": self.post_id})
@@ -328,7 +338,7 @@ class TumblrAPI(oauth.OAuth1API):
def __init__(self, extractor):
oauth.OAuth1API.__init__(self, extractor)
- self.posts_type = None
+ self.posts_type = self.before = None
def info(self, blog):
"""Return general information about a blog"""
@@ -350,6 +360,8 @@ class TumblrAPI(oauth.OAuth1API):
params.update({"offset": 0, "limit": 50, "reblog_info": "true"})
if self.posts_type:
params["type"] = self.posts_type
+ if self.before:
+ params["before"] = self.before
while True:
data = self._call(blog, "posts", params)
self.BLOG_CACHE[blog] = data["blog"]
@@ -360,7 +372,7 @@ class TumblrAPI(oauth.OAuth1API):
def likes(self, blog):
"""Retrieve liked posts"""
- params = {"limit": 50}
+ params = {"limit": "50", "before": self.before}
while True:
posts = self._call(blog, "likes", params)["liked_posts"]
if not posts:
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index ad4dc46..ccba640 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -11,6 +11,7 @@
from .common import Extractor, Message
from .. import text, exception
from ..cache import cache
+import re
class TwitterExtractor(Extractor):
@@ -26,8 +27,13 @@ class TwitterExtractor(Extractor):
Extractor.__init__(self, match)
self.user = match.group(1)
self.retweets = self.config("retweets", True)
+ self.content = self.config("content", False)
self.videos = self.config("videos", False)
+ if self.content:
+ self._emoji_sub = re.compile(
+ r'<img class="Emoji [^>]+ alt="([^"]+)"[^>]*>').sub
+
def items(self):
self.login()
yield Message.Version, 1
@@ -35,6 +41,7 @@ class TwitterExtractor(Extractor):
for tweet in self.tweets():
data = self._data_from_tweet(tweet)
+
if not self.retweets and data["retweet_id"]:
continue
@@ -87,10 +94,9 @@ class TwitterExtractor(Extractor):
raise exception.AuthenticationError()
return self.session.cookies
- @staticmethod
- def _data_from_tweet(tweet):
+ def _data_from_tweet(self, tweet):
extr = text.extract_from(tweet)
- return {
+ data = {
"tweet_id" : text.parse_int(extr('data-tweet-id="' , '"')),
"retweet_id": text.parse_int(extr('data-retweet-id="', '"')),
"retweeter" : extr('data-retweeter="' , '"'),
@@ -99,6 +105,14 @@ class TwitterExtractor(Extractor):
"user_id" : text.parse_int(extr('data-user-id="' , '"')),
"date" : text.parse_timestamp(extr('data-time="', '"')),
}
+ if self.content:
+ content = extr('<div class="js-tweet-text-container">', '\n</div>')
+ if '<img class="Emoji ' in content:
+ content = self._emoji_sub(r"\1", content)
+ content = text.unescape(text.remove_html(content, "", ""))
+ cl, _, cr = content.rpartition("pic.twitter.com/")
+ data["content"] = cl if cl and len(cr) < 16 else content
+ return data
def _tweets_from_api(self, url):
params = {
@@ -186,6 +200,11 @@ class TwitterTweetExtractor(TwitterExtractor):
"options": (("videos", True),),
"pattern": r"ytdl:https://twitter.com/perrypumas/status/\d+",
}),
+ # content with emoji, newlines, hashtags (#338)
+ ("https://twitter.com/yumi_san0112/status/1151144618936823808", {
+ "options": (("content", True),),
+ "keyword": "b13b6c4cd0b0c15b2ea7685479e7fedde3c47b9e",
+ }),
)
def __init__(self, match):
@@ -199,4 +218,4 @@ class TwitterTweetExtractor(TwitterExtractor):
url = "{}/{}/status/{}".format(self.root, self.user, self.tweet_id)
page = self.request(url).text
return (text.extract(
- page, '<div class="tweet ', '<ul class="stats')[0],)
+ page, '<div class="tweet ', 'class="js-tweet-stats-container')[0],)
diff --git a/gallery_dl/extractor/xvideos.py b/gallery_dl/extractor/xvideos.py
index 7eec18b..e253b7f 100644
--- a/gallery_dl/extractor/xvideos.py
+++ b/gallery_dl/extractor/xvideos.py
@@ -18,12 +18,6 @@ class XvideosExtractor(Extractor):
category = "xvideos"
root = "https://www.xvideos.com"
- def get_page(self, url, codes=(403, 404)):
- response = self.request(url, expect=codes)
- if response.status_code in codes:
- raise exception.NotFoundError(self.subcategory)
- return response.text
-
class XvideosGalleryExtractor(XvideosExtractor):
"""Extractor for user profile galleries from xvideos.com"""
@@ -37,7 +31,7 @@ class XvideosGalleryExtractor(XvideosExtractor):
(("https://www.xvideos.com/profiles"
"/pervertedcouple/photos/751031/random_stuff"), {
"url": "4f0d992e5dc39def2c3ac8e099d17bf09e76e3c7",
- "keyword": "8d637b372c6231cc4ada92dd5918db5fdbd06520",
+ "keyword": "65979d63a69576cf692b41d5fbbd995cc40a51b9",
}),
("https://www.xvideos.com/profiles/pervertedcouple/photos/751032/", {
"exception": exception.NotFoundError,
@@ -50,7 +44,7 @@ class XvideosGalleryExtractor(XvideosExtractor):
def items(self):
url = "{}/profiles/{}/photos/{}".format(self.root, self.user, self.gid)
- page = self.get_page(url)
+ page = self.request(url, notfound=self.subcategory).text
data = self.get_metadata(page)
imgs = self.get_images(page)
data["count"] = len(imgs)
@@ -113,7 +107,7 @@ class XvideosUserExtractor(XvideosExtractor):
def items(self):
url = "{}/profiles/{}".format(self.root, self.user)
- page = self.get_page(url)
+ page = self.request(url, notfound=self.subcategory).text
data = json.loads(text.extract(
page, "xv.conf=", ";</script>")[0])["data"]