summaryrefslogtreecommitdiffstats
path: root/gallery_dl/extractor/patreon.py
diff options
context:
space:
mode:
Diffstat (limited to 'gallery_dl/extractor/patreon.py')
-rw-r--r--gallery_dl/extractor/patreon.py82
1 files changed, 52 insertions, 30 deletions
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index 1e52559..0d51df2 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -12,6 +12,7 @@ from .common import Extractor, Message
from .. import text, exception
from ..cache import memcache
import collections
+import itertools
import json
@@ -33,43 +34,62 @@ class PatreonExtractor(Extractor):
PatreonExtractor._warning = False
for post in self.posts():
- ids = set()
post["num"] = 0
- content = post.get("content")
- postfile = post.get("post_file")
+ hashes = set()
yield Message.Directory, post
yield Message.Metadata, text.nameext_from_url(
post["creator"].get("image_url", ""), post)
- for image in post["images"]:
- url = image.get("download_url")
- if not url:
- continue
- ids.add(url.split("/")[-2])
- name = image.get("file_name") or self._filename(url) or url
+ for kind, url, name in itertools.chain(
+ self._postfile(post),
+ self._images(post),
+ self._attachments(post),
+ self._content(post),
+ ):
+ fhash = url.rsplit("/", 2)[1]
+ if fhash not in hashes:
+ hashes.add(fhash)
+ post["hash"] = fhash
+ post["type"] = kind
+ post["num"] += 1
+ yield Message.Url, url, text.nameext_from_url(name, post)
+ else:
+ self.log.debug("skipping %s (%s %s)", url, fhash, kind)
- post["num"] += 1
- post["type"] = "image"
- yield Message.Url, url, text.nameext_from_url(name, post)
+ @staticmethod
+ def _postfile(post):
+ postfile = post.get("post_file")
+ if postfile:
+ return (("postfile", postfile["url"], postfile["name"]),)
+ return ()
+
+ def _images(self, post):
+ for image in post["images"]:
+ url = image.get("download_url")
+ if url:
+ name = image.get("file_name") or self._filename(url) or url
+ yield "image", url, name
- if postfile and postfile["url"].split("/")[-2] not in ids:
- post["num"] += 1
- post["type"] = "postfile"
- text.nameext_from_url(postfile["name"], post)
- yield Message.Url, postfile["url"], post
+ def _attachments(self, post):
+ for attachment in post["attachments"]:
+ url = self.request(
+ attachment["url"], method="HEAD",
+ allow_redirects=False, fatal=False,
+ ).headers.get("Location")
- for attachment in post["attachments"]:
- post["num"] += 1
- post["type"] = "attachment"
- text.nameext_from_url(attachment["name"], post)
- yield Message.Url, attachment["url"], post
+ if url:
+ yield "attachment", url, attachment["name"]
- if content:
- for url in text.extract_iter(content, 'src="', '"'):
- post["num"] += 1
- post["type"] = "content"
- yield Message.Url, url, text.nameext_from_url(url, post)
+ @staticmethod
+ def _content(post):
+ content = post.get("content")
+ if content:
+ for img in text.extract_iter(
+ content, '<img data-media-id="', '>'):
+ url = text.extract(img, 'src="', '"')[0]
+ if url:
+ yield "content", url, url
def posts(self):
"""Return all relevant post objects"""
@@ -238,11 +258,13 @@ class PatreonPostExtractor(PatreonExtractor):
subcategory = "post"
pattern = r"(?:https?://)?(?:www\.)?patreon\.com/posts/([^/?&#]+)"
test = (
+ # postfile + attachments
("https://www.patreon.com/posts/precious-metal-23563293", {
"count": 4,
}),
- ("https://www.patreon.com/posts/er1-28201153", {
- "count": 1,
+ # postfile + content
+ ("https://www.patreon.com/posts/19987002", {
+ "count": 4,
}),
("https://www.patreon.com/posts/not-found-123", {
"exception": exception.NotFoundError,