summaryrefslogtreecommitdiffstats
path: root/gallery_dl/extractor/gelbooru_v02.py
diff options
context:
space:
mode:
authorLibravatarUnit 193 <unit193@unit193.net>2022-11-22 04:28:38 -0500
committerLibravatarUnit 193 <unit193@unit193.net>2022-11-22 04:28:38 -0500
commit7af5cc29d1c02d20a6890b7b7ba78ab41532a763 (patch)
tree4f0366e5653074c7eb31ac7ca59a1ee55f2d736e /gallery_dl/extractor/gelbooru_v02.py
parente59d46ecda74190381b1d2725b0bd9df5c0be8d8 (diff)
New upstream version 1.24.0.upstream/1.24.0
Diffstat (limited to 'gallery_dl/extractor/gelbooru_v02.py')
-rw-r--r--gallery_dl/extractor/gelbooru_v02.py182
1 files changed, 127 insertions, 55 deletions
diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py
index 8214614..da87b8f 100644
--- a/gallery_dl/extractor/gelbooru_v02.py
+++ b/gallery_dl/extractor/gelbooru_v02.py
@@ -31,6 +31,7 @@ class GelbooruV02Extractor(booru.BooruExtractor):
if self.category == "realbooru":
self._file_url = self._file_url_realbooru
+ self._tags = self._tags_realbooru
def _api_request(self, params):
url = self.api_root + "/index.php?page=dapi&s=post&q=index"
@@ -85,55 +86,58 @@ class GelbooruV02Extractor(booru.BooruExtractor):
post["date"] = text.parse_datetime(
post["created_at"], "%a %b %d %H:%M:%S %z %Y")
+ def _html(self, post):
+ return self.request("{}/index.php?page=post&s=view&id={}".format(
+ self.root, post["id"])).text
+
+ def _tags(self, post, page):
+ tag_container = (text.extr(page, '<ul id="tag-', '</ul>') or
+ text.extr(page, '<ul class="tag-', '</ul>'))
+ if not tag_container:
+ return
+
+ tags = collections.defaultdict(list)
+ pattern = re.compile(
+ r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)", re.S)
+ for tag_type, tag_name in pattern.findall(tag_container):
+ tags[tag_type].append(text.unquote(tag_name))
+ for key, value in tags.items():
+ post["tags_" + key] = " ".join(value)
+
+ def _notes(self, post, page):
+ note_container = text.extr(page, 'id="note-container"', "<img ")
+ if not note_container:
+ return
+
+ post["notes"] = notes = []
+ for note in note_container.split('class="note-box"')[1:]:
+ extr = text.extract_from(note)
+ notes.append({
+ "width" : int(extr("width:", "p")),
+ "height": int(extr("height:", "p")),
+ "y" : int(extr("top:", "p")),
+ "x" : int(extr("left:", "p")),
+ "id" : int(extr('id="note-body-', '"')),
+ "body" : text.unescape(text.remove_html(extr(">", "</div>"))),
+ })
+
def _file_url_realbooru(self, post):
url = post["file_url"]
- if url.count("/") == 5:
- md5 = post["md5"]
+ md5 = post["md5"]
+ if md5 not in post["preview_url"] or url.count("/") == 5:
url = "{}/images/{}/{}/{}.{}".format(
self.root, md5[0:2], md5[2:4], md5, url.rpartition(".")[2])
return url
- def _extended_tags(self, post, page=None):
- if not page:
- url = "{}/index.php?page=post&s=view&id={}".format(
- self.root, post["id"])
- page = self.request(url).text
- html = text.extract(page, '<ul id="tag-', '</ul>')[0]
- if not html:
- html = text.extract(page, '<ul class="tag-', '</ul>')[0]
- if html:
- tags = collections.defaultdict(list)
- pattern = re.compile(
- r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)", re.S)
- for tag_type, tag_name in pattern.findall(html):
- tags[tag_type].append(text.unquote(tag_name))
- for key, value in tags.items():
- post["tags_" + key] = " ".join(value)
- return page
-
- def _notes(self, post, page=None):
- if not page:
- url = "{}/index.php?page=post&s=view&id={}".format(
- self.root, post["id"])
- page = self.request(url).text
- notes = []
- notes_data = text.extract(page, '<section id="notes"', '</section>')[0]
- if not notes_data:
- return
-
- note_iter = text.extract_iter(notes_data, '<article', '</article>')
- extr = text.extract
- for note_data in note_iter:
- note = {
- "width": int(extr(note_data, 'data-width="', '"')[0]),
- "height": int(extr(note_data, 'data-height="', '"')[0]),
- "x": int(extr(note_data, 'data-x="', '"')[0]),
- "y": int(extr(note_data, 'data-y="', '"')[0]),
- "body": extr(note_data, 'data-body="', '"')[0],
- }
- notes.append(note)
-
- post["notes"] = notes
+ def _tags_realbooru(self, post, page):
+ tag_container = text.extr(page, 'id="tagLink"', '</div>')
+ tags = collections.defaultdict(list)
+ pattern = re.compile(
+ r'<a class="(?:tag-type-)?([^"]+).*?;tags=([^"&]+)')
+ for tag_type, tag_name in pattern.findall(tag_container):
+ tags[tag_type].append(text.unquote(tag_name))
+ for key, value in tags.items():
+ post["tags_" + key] = " ".join(value)
INSTANCES = {
@@ -310,15 +314,81 @@ class GelbooruV02PostExtractor(GelbooruV02Extractor):
archive_fmt = "{id}"
pattern = BASE_PATTERN + r"/index\.php\?page=post&s=view&id=(\d+)"
test = (
- ("https://rule34.xxx/index.php?page=post&s=view&id=1995545", {
- "content": "97e4bbf86c3860be18de384d02d544251afe1d45",
- "options": (("tags", True),),
+ ("https://rule34.xxx/index.php?page=post&s=view&id=863", {
+ "pattern": r"https://api-cdn\.rule34\.xxx/images"
+ r"/1/6aafbdb3e22f3f3b412ea2cf53321317a37063f3\.jpg",
+ "content": ("a43f418aa350039af0d11cae501396a33bbe2201",
+ "67b516295950867e1c1ab6bc13b35d3b762ed2a3"),
+ "options": (("tags", True), ("notes", True)),
"keyword": {
- "tags_artist": "danraku",
- "tags_character": "kashima_(kantai_collection)",
- "tags_copyright": "kantai_collection",
+ "tags_artist": "reverse_noise yamu_(reverse_noise)",
+ "tags_character": "hong_meiling",
+ "tags_copyright": "touhou",
"tags_general": str,
- "tags_metadata": str,
+ "tags_metadata": "censored translated",
+ "notes": [
+ {
+ "body": "It feels angry, I'm losing myself... "
+ "It won't calm down!",
+ "height": 65,
+ "id": 93586,
+ "width": 116,
+ "x": 22,
+ "y": 333,
+ },
+ {
+ "body": "REPUTATION OF RAGE",
+ "height": 272,
+ "id": 93587,
+ "width": 199,
+ "x": 78,
+ "y": 442,
+ },
+ ],
+
+ },
+ }),
+ ("https://hypnohub.net/index.php?page=post&s=view&id=1439", {
+ "pattern": r"https://hypnohub\.net/images"
+ r"/90/24/90245c3c5250c2a8173255d3923a010b\.jpg",
+ "content": "5987c5d2354f22e5fa9b7ee7ce4a6f7beb8b2b71",
+ "options": (("tags", True), ("notes", True)),
+ "keyword": {
+ "tags_artist": "brokenteapot",
+ "tags_character": "hsien-ko",
+ "tags_copyright": "capcom darkstalkers",
+ "tags_general": str,
+ "tags_metadata": "dialogue text translated",
+ "notes": [
+ {
+ "body": "Master Master Master "
+ "Master Master Master",
+ "height": 83,
+ "id": 10577,
+ "width": 129,
+ "x": 259,
+ "y": 20,
+ },
+ {
+ "body": "Response Response Response "
+ "Response Response Response",
+ "height": 86,
+ "id": 10578,
+ "width": 125,
+ "x": 126,
+ "y": 20,
+ },
+ {
+ "body": "Obedience Obedience Obedience "
+ "Obedience Obedience Obedience",
+ "height": 80,
+ "id": 10579,
+ "width": 98,
+ "x": 20,
+ "y": 20,
+ },
+ ],
+
},
}),
("https://safebooru.org/index.php?page=post&s=view&id=1169132", {
@@ -336,16 +406,18 @@ class GelbooruV02PostExtractor(GelbooruV02Extractor):
"pattern": r"https://realbooru\.com/images/dc/b5"
r"/dcb5c0ce9ec0bf74a6930608985f4719\.jpeg",
"content": "7f5873ce3b6cd295ea2e81fcb49583098ea9c8da",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_general": "1girl blonde blonde_hair blue_eyes cute "
+ "female female_only looking_at_viewer smile "
+ "solo solo_female teeth",
+ "tags_model": "jennifer_lawrence",
+ },
}),
("https://tbib.org/index.php?page=post&s=view&id=9233957", {
"url": "5a6ebe07bfff8e6d27f7c30b5480f27abcb577d2",
"content": "1c3831b6fbaa4686e3c79035b5d98460b1c85c43",
}),
- ("https://hypnohub.net/index.php?page=post&s=view&id=73964", {
- "pattern": r"https://hypnohub\.net/images/7a/37"
- r"/7a37c0ba372f35767fb10c904a398831\.png",
- "content": "02d5f5a8396b621a6efc04c5f8ef1b7225dfc6ee",
- }),
)
def __init__(self, match):