diff options
| author | 2021-11-01 05:03:49 -0400 | |
|---|---|---|
| committer | 2021-11-01 05:03:49 -0400 | |
| commit | 4a965d875415907cc1a016b428ae305a964f9228 (patch) | |
| tree | 7cece9948a7ba390348e00c669f9cb1f7a9ba39a /gallery_dl/extractor/vk.py | |
| parent | 34ba2951b8c523713425c98addb9256ea05c946f (diff) | |
New upstream version 1.19.1. (upstream/1.19.1)
Diffstat (limited to 'gallery_dl/extractor/vk.py')
| -rw-r--r-- | gallery_dl/extractor/vk.py | 132 |
1 file changed, 91 insertions, 41 deletions
diff --git a/gallery_dl/extractor/vk.py b/gallery_dl/extractor/vk.py index 9dd2d47..9724c4b 100644 --- a/gallery_dl/extractor/vk.py +++ b/gallery_dl/extractor/vk.py @@ -12,18 +12,67 @@ from .common import Extractor, Message from .. import text import re +BASE_PATTERN = r"(?:https://)?(?:www\.|m\.)?vk\.com" -class VkPhotosExtractor(Extractor): - """Extractor for photos from a vk user""" + +class VkExtractor(Extractor): + """Base class for vk extractors""" category = "vk" - subcategory = "photos" directory_fmt = ("{category}", "{user[name]|user[id]}") filename_fmt = "{id}.{extension}" archive_fmt = "{id}" root = "https://vk.com" request_interval = 1.0 - pattern = (r"(?:https://)?(?:www\.|m\.)?vk\.com/(?:" - r"(?:albums|photos|id)(-?\d+)|([^/?#]+))") + + def items(self): + data = self.metadata() + yield Message.Directory, data + for photo in self.photos(): + photo.update(data) + yield Message.Url, photo["url"], photo + + def _pagination(self, photos_url, user_id): + sub = re.compile(r"/imp[fg]/").sub + needle = 'data-id="{}_'.format(user_id) + cnt = 0 + + headers = { + "X-Requested-With": "XMLHttpRequest", + "Origin" : self.root, + "Referer" : photos_url, + } + params = { + "al" : "1", + "al_ad" : "0", + "offset": 0, + "part" : "1", + } + + while True: + payload = self.request( + photos_url, method="POST", headers=headers, data=params + ).json()["payload"][1] + + offset = payload[0] + html = payload[1] + + for cnt, photo in enumerate(text.extract_iter(html, needle, ')')): + pid = photo[:photo.find('"')] + url = photo[photo.rindex("(")+1:] + url = sub("/", url.partition("?")[0]) + yield text.nameext_from_url(url, {"url": url, "id": pid}) + + if cnt <= 20 or offset == params["offset"]: + return + params["offset"] = offset + + +class VkPhotosExtractor(VkExtractor): + """Extractor for photos from a vk user""" + subcategory = "photos" + pattern = (BASE_PATTERN + r"/(?:" + r"(?:albums|photos|id)(-?\d+)" + r"|(?!album-?\d+_)([^/?#]+))") test = ( 
("https://vk.com/id398982326", { "pattern": r"https://sun\d+-\d+\.userapi\.com/c\d+/v\d+" @@ -58,10 +107,14 @@ class VkPhotosExtractor(Extractor): ) def __init__(self, match): - Extractor.__init__(self, match) + VkExtractor.__init__(self, match) self.user_id, self.user_name = match.groups() - def items(self): + def photos(self): + url = "{}/photos{}".format(self.root, self.user_id) + return self._pagination(url, self.user_id) + + def metadata(self): if self.user_id: user_id = self.user_id prefix = "public" if user_id[0] == "-" else "id" @@ -70,40 +123,8 @@ class VkPhotosExtractor(Extractor): else: url = "{}/{}".format(self.root, self.user_name) data = self._extract_profile(url) - user_id = data["user"]["id"] - - photos_url = "{}/photos{}".format(self.root, user_id) - headers = { - "X-Requested-With": "XMLHttpRequest", - "Origin" : self.root, - "Referer" : photos_url, - } - params = { - "al" : "1", - "al_ad" : "0", - "offset": 0, - "part" : "1", - } - - yield Message.Directory, data - sub = re.compile(r"/imp[fg]/").sub - needle = 'data-id="{}_'.format(user_id) - cnt = 0 - - while True: - offset, html = self.request( - photos_url, method="POST", headers=headers, data=params - ).json()["payload"][1] - - for cnt, photo in enumerate(text.extract_iter(html, needle, ')')): - data["id"] = photo[:photo.find('"')] - url = photo[photo.rindex("(")+1:] - url = sub("/", url.partition("?")[0]) - yield Message.Url, url, text.nameext_from_url(url, data) - - if cnt <= 40 or offset == params["offset"]: - return - params["offset"] = offset + self.user_id = data["user"]["id"] + return data def _extract_profile(self, url): extr = text.extract_from(self.request(url).text) @@ -116,3 +137,32 @@ class VkPhotosExtractor(Extractor): '<span class="current_text">', '</span'))), "id" : extr('<a href="/albums', '"'), }} + + +class VkAlbumExtractor(VkExtractor): + """Extractor for a vk album""" + subcategory = "album" + directory_fmt = ("{category}", "{user[id]}", "{album[id]}") + pattern = 
BASE_PATTERN + r"/album(-?\d+)_(\d+)$" + test = ( + ("https://vk.com/album221469416_0", { + "count": 3, + }), + ("https://vk.com/album-165740836_281339889", { + "count": 12, + }), + ) + + def __init__(self, match): + VkExtractor.__init__(self, match) + self.user_id, self.album_id = match.groups() + + def photos(self): + url = "{}/album{}_{}".format(self.root, self.user_id, self.album_id) + return self._pagination(url, self.user_id) + + def metadata(self): + return { + "user": {"id": self.user_id}, + "album": {"id": self.album_id}, + } |
