diff options
Diffstat (limited to 'gallery_dl/extractor/zerochan.py')
| -rw-r--r-- | gallery_dl/extractor/zerochan.py | 41 |
1 files changed, 32 insertions, 9 deletions
diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py index fc61dff..126ef49 100644 --- a/gallery_dl/extractor/zerochan.py +++ b/gallery_dl/extractor/zerochan.py @@ -11,6 +11,8 @@ from .booru import BooruExtractor from ..cache import cache from .. import text, util, exception +import collections +import re BASE_PATTERN = r"(?:https?://)?(?:www\.)?zerochan\.net" @@ -76,22 +78,29 @@ class ZerochanExtractor(BooruExtractor): 'class="breadcrumbs', '</nav>'))[2:], "uploader": extr('href="/user/', '"'), "tags" : extr('<ul id="tags"', '</ul>'), - "source" : extr('<h2>Source</h2>', '</p><h2>').rpartition( - ">")[2] or None, + "source" : text.unescape(text.extr( + extr('id="source-url"', '</a>'), 'href="', '"')), } html = data["tags"] tags = data["tags"] = [] for tag in html.split("<li class=")[1:]: - category = text.extr(tag, 'data-type="', '"') + category = text.extr(tag, '"', '"') name = text.extr(tag, 'data-tag="', '"') - tags.append(category.capitalize() + ":" + name) + tags.append(category.partition(" ")[0].capitalize() + ":" + name) return data def _parse_entry_api(self, entry_id): url = "{}/{}?json".format(self.root, entry_id) - item = self.request(url).json() + text = self.request(url).text + try: + item = util.json_loads(text) + except ValueError as exc: + if " control character " not in str(exc): + raise + text = re.sub(r"[\x00-\x1f\x7f]", "", text) + item = util.json_loads(text) data = { "id" : item["id"], @@ -109,6 +118,14 @@ class ZerochanExtractor(BooruExtractor): return data + def _tags(self, post, page): + tags = collections.defaultdict(list) + for tag in post["tags"]: + category, _, name = tag.partition(":") + tags[category].append(name) + for key, value in tags.items(): + post["tags_" + key.lower()] = value + class ZerochanTagExtractor(ZerochanExtractor): subcategory = "tag" @@ -180,10 +197,16 @@ class ZerochanTagExtractor(ZerochanExtractor): static = "https://static.zerochan.net/.full." while True: - data = self.request(url, params=params).json() + response = self.request(url, params=params, allow_redirects=False) + if response.status_code >= 300: + url = text.urljoin(self.root, response.headers["location"]) + response = self.request(url, params=params) + data = response.json() + try: posts = data["items"] - except ValueError: + except Exception: + self.log.debug("Server response: %s", data) return if metadata: @@ -191,13 +214,13 @@ class ZerochanTagExtractor(ZerochanExtractor): post_id = post["id"] post.update(self._parse_entry_html(post_id)) post.update(self._parse_entry_api(post_id)) + yield post else: for post in posts: base = static + str(post["id"]) post["file_url"] = base + ".jpg" post["_fallback"] = (base + ".png",) - - yield from posts + yield post if not data.get("next"): return |
