author     2019-08-04 17:52:59 -0400
committer  2019-08-04 17:52:59 -0400
commit     64ad8e7bd15df71ab1116eede414558631bcad32 (patch)
tree       7416e191aedce591087903a943198aed13fa0b26 /gallery_dl/extractor
parent     2a63a9c9b7032a76894c48ac4d9cea732fcaee49 (diff)
New upstream version 1.10.1 (tag: upstream/1.10.1)
Diffstat (limited to 'gallery_dl/extractor')
-rw-r--r--  gallery_dl/extractor/__init__.py         3
-rw-r--r--  gallery_dl/extractor/adultempire.py     58
-rw-r--r--  gallery_dl/extractor/behance.py         22
-rw-r--r--  gallery_dl/extractor/dynastyscans.py     2
-rw-r--r--  gallery_dl/extractor/exhentai.py       102
-rw-r--r--  gallery_dl/extractor/gelbooru.py         1
-rw-r--r--  gallery_dl/extractor/imgbb.py          179
-rw-r--r--  gallery_dl/extractor/luscious.py         2
-rw-r--r--  gallery_dl/extractor/ngomik.py           2
-rw-r--r--  gallery_dl/extractor/sankaku.py          4
-rw-r--r--  gallery_dl/extractor/sankakucomplex.py   4
-rw-r--r--  gallery_dl/extractor/tsumino.py          2
-rw-r--r--  gallery_dl/extractor/vsco.py           176
13 files changed, 509 insertions(+), 48 deletions(-)
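
The bulk of this release is three new extractor modules: adultempire, imgbb, and vsco. Adding a module's name to the list in gallery_dl/extractor/__init__.py is all that is needed for URL matching to pick it up. A minimal sketch of that lookup, assuming gallery-dl 1.10.1 is installed and using its extractor.find() helper (the URL is taken from the test cases in the diff below; the printed values are expectations, not guaranteed output):

    # Resolve a URL against gallery-dl's extractor registry.
    # find() tries the compiled `pattern` of every module listed in
    # gallery_dl/extractor/__init__.py and returns an instance of the
    # first matching extractor class, or None if nothing matches.
    from gallery_dl import extractor

    ex = extractor.find("https://www.adultempire.com/5998/gallery.html")
    if ex is not None:
        print(ex.category)     # expected: "adultempire"
        print(ex.subcategory)  # expected: "gallery"
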
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 189c163..0b24111 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -17,6 +17,7 @@ modules = [
     "500px",
     "8chan",
     "8muses",
+    "adultempire",
     "artstation",
     "behance",
     "bobx",
@@ -42,6 +43,7 @@ modules = [
     "idolcomplex",
     "imagebam",
     "imagefap",
+    "imgbb",
     "imgbox",
     "imgth",
     "imgur",
@@ -95,6 +97,7 @@ modules = [
     "tumblr",
     "twitter",
     "vanillarock",
+    "vsco",
     "wallhaven",
     "warosu",
     "weibo",
diff --git a/gallery_dl/extractor/adultempire.py b/gallery_dl/extractor/adultempire.py
new file mode 100644
index 0000000..5ea835f
--- /dev/null
+++ b/gallery_dl/extractor/adultempire.py
@@ -0,0 +1,58 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.adultempire.com/"""
+
+from .common import GalleryExtractor
+from .. import text
+
+
+class AdultempireGalleryExtractor(GalleryExtractor):
+    """Extractor for image galleries from www.adultempire.com"""
+    category = "adultempire"
+    root = "https://www.adultempire.com"
+    pattern = (r"(?:https?://)?(?:www\.)?adult(?:dvd)?empire\.com"
+               r"(/(\d+)/gallery\.html)")
+    test = (
+        ("https://www.adultempire.com/5998/gallery.html", {
+            "range": "1",
+            "keyword": "0533ef1184892be8ac02b17286797c95f389ba63",
+            "content": "5c6beb31e5e3cdc90ee5910d5c30f9aaec977b9e",
+        }),
+        ("https://www.adultdvdempire.com/5683/gallery.html", {
+            "url": "b12cd1a65cae8019d837505adb4d6a2c1ed4d70d",
+            "keyword": "59fe5d95929efc5040a819a5f77aba7a022bb85a",
+        }),
+    )
+
+    def __init__(self, match):
+        GalleryExtractor.__init__(self, match)
+        self.gallery_id = match.group(2)
+
+    def metadata(self, page):
+        extr = text.extract_from(page, page.index('<div id="content">'))
+        return {
+            "gallery_id": text.parse_int(self.gallery_id),
+            "title"     : text.unescape(extr('title="', '"')),
+            "studio"    : extr(">studio</small>", "<").strip(),
+            "date"      : text.parse_datetime(extr(
+                ">released</small>", "<").strip(), "%m/%d/%Y"),
+            "actors"    : text.split_html(extr(
+                '<ul class="item-details item-cast-list ', '</ul>'))[1:],
+        }
+
+    def images(self, page):
+        params = {"page": 1}
+        while True:
+            urls = list(text.extract_iter(page, 'rel="L"><img src="', '"'))
+            for url in urls:
+                yield url.replace("_200.", "_9600."), None
+            if len(urls) < 24:
+                return
+            params["page"] += 1
+            page = self.request(self.chapter_url, params=params).text
diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py
index 111d560..467a935 100644
--- a/gallery_dl/extractor/behance.py
+++ b/gallery_dl/extractor/behance.py
@@ -30,7 +30,8 @@ class BehanceExtractor(Extractor):
     @staticmethod
     def _update(data):
         # compress data to simple lists
-        data["fields"] = [field["name"] for field in data["fields"]]
+        if data["fields"] and isinstance(data["fields"][0], dict):
+            data["fields"] = [field["name"] for field in data["fields"]]
         data["owners"] = [owner["display_name"] for owner in data["owners"]]
         if "tags" in data:
             data["tags"] = [tag["title"] for tag in data["tags"]]
@@ -140,11 +141,11 @@ class BehanceUserExtractor(BehanceExtractor):
 
     def galleries(self):
         url = "{}/{}/projects".format(self.root, self.user)
-        headers = {"X-Requested-With": "XMLHttpRequest"}
         params = {"offset": 0}
+        headers = {"X-Requested-With": "XMLHttpRequest"}
 
         while True:
-            data = self.request(url, headers=headers, params=params).json()
+            data = self.request(url, params=params, headers=headers).json()
             work = data["profile"]["activeSection"]["work"]
             yield from work["projects"]
 
             if not work["hasMore"]:
@@ -157,8 +158,8 @@ class BehanceCollectionExtractor(BehanceExtractor):
     subcategory = "collection"
     categorytransfer = True
     pattern = r"(?:https?://)?(?:www\.)?behance\.net/collection/(\d+)"
-    test = ("https://www.behance.net/collection/170615607/Sky", {
-        "count": ">= 13",
+    test = ("https://www.behance.net/collection/71340149/inspiration", {
+        "count": ">= 145",
         "pattern": BehanceGalleryExtractor.pattern,
     })
@@ -168,12 +169,13 @@ class BehanceCollectionExtractor(BehanceExtractor):
 
     def galleries(self):
         url = "{}/collection/{}/a".format(self.root, self.collection_id)
+        params = {"offset": 0}
         headers = {"X-Requested-With": "XMLHttpRequest"}
-        params = {}
 
         while True:
-            data = self.request(url, headers=headers, params=params).json()
-            yield from data["output"]
-            if not data.get("offset"):
+            data = self.request(url, params=params, headers=headers).json()
+            for item in data["items"]:
+                yield item["project"]
+
+            if len(data["items"]) < 40:
                 return
-            params["offset"] = data["offset"]
+            params["offset"] += len(data["items"])
diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py
index b10bd35..9cc6738 100644
--- a/gallery_dl/extractor/dynastyscans.py
+++ b/gallery_dl/extractor/dynastyscans.py
@@ -100,7 +100,7 @@ class DynastyscansSearchExtractor(DynastyscansBase, Extractor):
     test = (
         ("https://dynasty-scans.com/images?with[]=4930&with[]=5211", {
             "url": "6b570eedd8a741c2cd34fb98b22a49d772f84191",
-            "keyword": "a1e2d05c1406a08b02f347389616a6babb1b50bf",
+            "keyword": "fa7ff94f82cdf942f7734741d758f160a6b0905a",
         }),
         ("https://dynasty-scans.com/images", {
             "range": "1",
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index 20e0746..1833b1a 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -6,7 +6,7 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extract images from galleries at https://exhentai.org/"""
+"""Extractors for https://e-hentai.org/ and https://exhentai.org/"""
 
 from .common import Extractor, Message
 from .. import text, util, exception
@@ -23,16 +23,19 @@ BASE_PATTERN = r"(?:https?://)?(e[x-]|g\.e-)hentai\.org"
 class ExhentaiExtractor(Extractor):
     """Base class for exhentai extractors"""
     category = "exhentai"
-    directory_fmt = ("{category}", "{gallery_id}")
+    directory_fmt = ("{category}", "{gallery_id} {title}")
     filename_fmt = (
         "{gallery_id}_{num:>04}_{image_token}_{filename}.{extension}")
     archive_fmt = "{gallery_id}_{num}"
-    cookiedomain = ".exhentai.org"
     cookienames = ("ipb_member_id", "ipb_pass_hash")
+    cookiedomain = ".exhentai.org"
     root = "https://exhentai.org"
 
+    LIMIT = False
+
     def __init__(self, match):
-        if match.group(1) != "ex":
+        version = match.group(1)
+        if version != "ex":
             self.root = "https://e-hentai.org"
             self.cookiedomain = ".e-hentai.org"
         Extractor.__init__(self, match)
@@ -45,6 +48,8 @@ class ExhentaiExtractor(Extractor):
         if self.wait_max < self.wait_min:
             self.wait_max = self.wait_min
         self.session.headers["Referer"] = self.root + "/"
+        if version != "ex":
+            self.session.cookies.set("nw", "1", domain=self.cookiedomain)
 
     def request(self, *args, **kwargs):
         response = Extractor.request(self, *args, **kwargs)
@@ -63,6 +68,9 @@ class ExhentaiExtractor(Extractor):
 
     def login(self):
         """Login and set necessary cookies"""
+        if self.LIMIT:
+            self.log.error("Image limit reached!")
+            raise exception.StopExtraction()
         if self._check_cookies(self.cookienames):
             return
         username, password = self._get_auth_info()
@@ -92,7 +100,7 @@ class ExhentaiExtractor(Extractor):
         }
 
         response = self.request(
             url, method="POST", headers=headers, data=data)
-        if "You are now logged in as:" not in response.text:
+        if b"You are now logged in as:" not in response.content:
             raise exception.AuthenticationError()
         return {c: response.cookies[c] for c in self.cookienames}
@@ -112,9 +120,9 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
                r"(?:/g/(\d+)/([\da-f]{10})"
                r"|/s/([\da-f]{10})/(\d+)-(\d+))")
     test = (
-        ("https://exhentai.org/g/960460/4f0e369d82/", {
-            "keyword": "1532ca4d0e4e0738dc994ca725a228af04a4e480",
-            "content": "493d759de534355c9f55f8e365565b62411de146",
+        ("https://exhentai.org/g/1200119/d55c44d3d0/", {
+            "keyword": "1b353fad00dff0665b1746cdd151ab5cc326df23",
+            "content": "e9891a4c017ed0bb734cd1efba5cd03f594d31ff",
         }),
         ("https://exhentai.org/g/960461/4f0e369d82/", {
             "exception": exception.NotFoundError,
@@ -122,13 +130,13 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
         ("http://exhentai.org/g/962698/7f02358e00/", {
             "exception": exception.AuthorizationError,
         }),
-        ("https://exhentai.org/s/3957343c3b/960460-5", {
+        ("https://exhentai.org/s/f68367b4c8/1200119-3", {
             "count": 2,
         }),
-        ("https://e-hentai.org/s/3957343c3b/960460-5", {
+        ("https://e-hentai.org/s/f68367b4c8/1200119-3", {
             "count": 2,
         }),
-        ("https://g.e-hentai.org/g/960460/4f0e369d82/"),
+        ("https://g.e-hentai.org/g/1200119/d55c44d3d0/"),
     )
 
     def __init__(self, match):
@@ -143,14 +151,25 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
 
     def items(self):
         self.login()
+
+        if self.limits:
+            self._init_limits()
+
         if self.gallery_token:
             gpage = self._gallery_page()
             self.image_token = text.extract(gpage, 'hentai.org/s/', '"')[0]
+            if not self.image_token:
+                self.log.error("Failed to extract initial image token")
+                self.log.debug("Page content:\n%s", gpage)
+                return
             self.wait()
             ipage = self._image_page()
         else:
             ipage = self._image_page()
             part = text.extract(ipage, 'hentai.org/g/', '"')[0]
+            if not part:
+                self.log.error("Failed to extract gallery token")
+                self.log.debug("Page content:\n%s", ipage)
+                return
             self.gallery_token = part.split("/")[1]
             self.wait()
             gpage = self._gallery_page()
@@ -211,12 +230,17 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
         iurl = extr('<img id="img" src="', '"')
         orig = extr('hentai.org/fullimg.php', '"')
 
-        if self.original and orig:
-            url = self.root + "/fullimg.php" + text.unescape(orig)
-            data = self._parse_original_info(extr('ownload original', '<'))
-        else:
-            url = iurl
-            data = self._parse_image_info(url)
+        try:
+            if self.original and orig:
+                url = self.root + "/fullimg.php" + text.unescape(orig)
+                data = self._parse_original_info(extr('ownload original', '<'))
+            else:
+                url = iurl
+                data = self._parse_image_info(url)
+        except IndexError:
+            self.log.error("Unable to parse image info for '%s'", url)
+            self.log.debug("Page content:\n%s", page)
+            raise exception.StopExtraction()
 
         data["num"] = self.image_num
         data["image_token"] = self.key["start"] = extr('var startkey="', '";')
@@ -242,13 +266,18 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
             imgurl , pos = text.extract(page["i3"], 'id="img" src="', '"', pos)
             origurl, pos = text.extract(page["i7"], '<a href="', '"')
 
-            if self.original and origurl:
-                url = text.unescape(origurl)
-                data = self._parse_original_info(
-                    text.extract(page["i7"], "ownload original", "<", pos)[0])
-            else:
-                url = imgurl
-                data = self._parse_image_info(url)
+            try:
+                if self.original and origurl:
+                    url = text.unescape(origurl)
+                    data = self._parse_original_info(text.extract(
+                        page["i7"], "ownload original", "<", pos)[0])
+                else:
+                    url = imgurl
+                    data = self._parse_image_info(url)
+            except IndexError:
+                self.log.error("Unable to parse image info for '%s'", url)
+                self.log.debug("Page content:\n%s", page)
+                raise exception.StopExtraction()
 
             data["num"] = request["page"]
             data["image_token"] = imgkey
@@ -266,6 +295,8 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
             raise exception.AuthorizationError()
         if page.startswith(("Key missing", "Gallery not found")):
             raise exception.NotFoundError("gallery")
+        if "hentai.org/mpv/" in page:
+            self.log.warning("Enabled Multi-Page Viewer is not supported")
         return page
 
     def _image_page(self):
@@ -277,17 +308,24 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
             raise exception.NotFoundError("image page")
         return page
 
+    def _init_limits(self):
+        self._update_limits()
+        if self._remaining <= 0:
+            self.log.error("Image limit reached!")
+            ExhentaiExtractor.LIMIT = True
+            raise exception.StopExtraction()
+
     def _check_limits(self, data):
-        if not self._remaining or data["num"] % 20 == 0:
+        if data["num"] % 20 == 0:
             self._update_limits()
         self._remaining -= data["cost"]
+
         if self._remaining <= 0:
             url = "{}/s/{}/{}-{}".format(
                 self.root, data["image_token"], self.gallery_id, data["num"])
-            self.log.error(
-                "Image limit reached! Reset it and continue with "
-                "'%s' as URL.", url)
+            self.log.error("Image limit reached! Continue with "
+                           "'%s' as URL after resetting it.", url)
+            ExhentaiExtractor.LIMIT = True
             raise exception.StopExtraction()
 
     def _update_limits(self):
@@ -301,6 +339,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
         page = self.request(url, cookies=cookies).text
 
         current, pos = text.extract(page, "<strong>", "</strong>")
         maximum, pos = text.extract(page, "<strong>", "</strong>", pos)
+        self.log.debug("Image Limits: %s/%s", current, maximum)
         self._remaining = text.parse_int(maximum) - text.parse_int(current)
 
     @staticmethod
@@ -330,7 +369,7 @@ class ExhentaiSearchExtractor(ExhentaiExtractor):
     subcategory = "search"
     pattern = BASE_PATTERN + r"/?\?(.*)$"
     test = (
-        ("https://exhentai.org/?f_search=touhou"),
+        ("https://e-hentai.org/?f_search=touhou"),
        (("https://exhentai.org/?f_doujinshi=0&f_manga=0&f_artistcg=0"
          "&f_gamecg=0&f_western=0&f_non-h=1&f_imageset=0&f_cosplay=0"
          "&f_asianporn=0&f_misc=0&f_search=touhou&f_apply=Apply+Filter"), {
@@ -372,7 +411,10 @@ class ExhentaiFavoriteExtractor(ExhentaiSearchExtractor):
     subcategory = "favorite"
     pattern = BASE_PATTERN + r"/favorites\.php(?:\?(.*))?"
     test = (
-        ("https://exhentai.org/favorites.php"),
+        ("https://e-hentai.org/favorites.php", {
+            "count": 1,
+            "pattern": r"https?://e-hentai\.org/g/1200119/d55c44d3d0"
+        }),
         ("https://exhentai.org/favorites.php?favcat=1&f_search=touhou"
          "&f_apply=Search+Favorites"),
     )
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index 15bd0a8..ce2e83b 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -30,6 +30,7 @@ class GelbooruExtractor(booru.XmlParserMixin,
             self.params.update({"page": "dapi", "s": "post", "q": "index"})
         else:
             self.items = self.items_noapi
+            self.session.cookies["fringeBenefits"] = "yup"
 
     def items_noapi(self):
         data = self.get_metadata()
diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py
new file mode 100644
index 0000000..442634b
--- /dev/null
+++ b/gallery_dl/extractor/imgbb.py
@@ -0,0 +1,179 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://imgbb.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+from ..cache import cache
+import json
+
+
+class ImgbbExtractor(Extractor):
+    """Base class for imgbb extractors"""
+    category = "imgbb"
+    filename_fmt = "{title} {id}.{extension}"
+    archive_fmt = "{id}"
+    root = "https://imgbb.com"
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.page_url = self.sort = None
+
+    def items(self):
+        self.login()
+        page = self.request(self.page_url, params={"sort": self.sort}).text
+        data = self.metadata(page)
+        first = True
+
+        yield Message.Version, 1
+        for img in self.images(page):
+            image = {
+                "id"       : img["url_viewer"].rpartition("/")[2],
+                "user"     : img["user"]["username"],
+                "title"    : text.unescape(img["title"]),
+                "url"      : img["image"]["url"],
+                "extension": img["image"]["extension"],
+                "size"     : text.parse_int(img["image"]["size"]),
+                "width"    : text.parse_int(img["width"]),
+                "height"   : text.parse_int(img["height"]),
+            }
+            image.update(data)
+            if first:
+                first = False
+                yield Message.Directory, data
+            yield Message.Url, image["url"], image
+
+    def login(self):
+        username, password = self._get_auth_info()
+        if username:
+            self._update_cookies(self._login_impl(username, password))
+
+    @cache(maxage=360*24*3600, keyarg=1)
+    def _login_impl(self, username, password):
+        self.log.info("Logging in as %s", username)
+
+        url = self.root + "/login"
+        page = self.request(url).text
+        token = text.extract(page, 'PF.obj.config.auth_token="', '"')[0]
+
+        headers = {"Referer": url}
+        data = {
+            "auth_token"   : token,
+            "login-subject": username,
+            "password"     : password,
+        }
+        response = self.request(
+            url, method="POST", headers=headers, data=data)
+
+        if not response.history:
+            raise exception.AuthenticationError()
+        return self.session.cookies
+
+    def _pagination(self, page, endpoint, params):
+        params["page"] = 2
+        data = None
+
+        while True:
+            for img in text.extract_iter(page, "data-object='", "'"):
+                yield json.loads(text.unquote(img))
+            if data:
+                if params["seek"] == data["seekEnd"]:
+                    return
+                params["seek"] = data["seekEnd"]
+                params["page"] += 1
+            data = self.request(endpoint, "POST", data=params).json()
+            page = data["html"]
+
+
+class ImgbbAlbumExtractor(ImgbbExtractor):
+    """Extractor for albums on imgbb.com"""
+    subcategory = "album"
+    directory_fmt = ("{category}", "{user}", "{album_name} {album_id}")
+    pattern = r"(?:https?://)?ibb\.co/album/([^/?&#]+)/?(?:\?([^#]+))?"
+    test = (
+        ("https://ibb.co/album/c6p5Yv", {
+            "range": "1-80",
+            "url": "8adaf0f7dfc19ff8bc4712c97f534af8b1e06412",
+            "keyword": "155b665a53e83d359e914cab7c69d5b829444d64",
+        }),
+        ("https://ibb.co/album/c6p5Yv?sort=title_asc", {
+            "range": "1-80",
+            "url": "d6c45041d5c8323c435b183a976f3fde2af7c547",
+            "keyword": "30c3262214e2044bbcf6bf2dee8e3ca7ebd62b71",
+        }),
+    )
+
+    def __init__(self, match):
+        ImgbbExtractor.__init__(self, match)
+        self.album_name = None
+        self.album_id = match.group(1)
+        self.sort = text.parse_query(match.group(2)).get("sort", "date_desc")
+        self.page_url = "https://ibb.co/album/" + self.album_id
+
+    def metadata(self, page):
+        album, pos = text.extract(page, '"og:title" content="', '"')
+        user , pos = text.extract(page, 'rel="author">', '<', pos)
+        return {
+            "album_id"  : self.album_id,
+            "album_name": text.unescape(album),
+            "user"      : user.lower(),
+        }
+
+    def images(self, page):
+        seek, pos = text.extract(page, 'data-seek="', '"')
+        tokn, pos = text.extract(page, 'PF.obj.config.auth_token="', '"', pos)
+
+        return self._pagination(page, "https://ibb.co/json", {
+            "action"    : "list",
+            "list"      : "images",
+            "from"      : "album",
+            "sort"      : self.sort,
+            "albumid"   : self.album_id,
+            "seek"      : seek,
+            "auth_token": tokn,
+            "params_hidden[list]"   : "images",
+            "params_hidden[from]"   : "album",
+            "params_hidden[albumid]": self.album_id,
+        })
+
+
+class ImgbbUserExtractor(ImgbbExtractor):
+    """Extractor for user profiles in imgbb.com"""
+    subcategory = "user"
+    directory_fmt = ("{category}", "{user}")
+    pattern = r"(?:https?://)?([^.]+)\.imgbb\.com/?(?:\?([^#]+))?$"
+    test = ("https://folkie.imgbb.com", {
+        "range": "1-80",
+        "pattern": r"https?://i\.ibb\.co/\w+/[^/?&#]+",
+    })
+
+    def __init__(self, match):
+        ImgbbExtractor.__init__(self, match)
+        self.user = match.group(1)
+        self.sort = text.parse_query(match.group(2)).get("sort", "date_desc")
+        self.page_url = "https://{}.imgbb.com/".format(self.user)
+
+    def metadata(self, page):
+        return {"user": self.user}
+
+    def images(self, page):
+        seek, pos = text.extract(page, 'data-seek="', '"')
+        tokn, pos = text.extract(page, 'PF.obj.config.auth_token="', '"', pos)
+        user, pos = text.extract(page, '.obj.resource={"id":"', '"', pos)
+
+        return self._pagination(page, self.page_url + "json", {
+            "action"    : "list",
+            "list"      : "images",
+            "from"      : "user",
+            "sort"      : self.sort,
+            "seek"      : seek,
+            "userid"    : user,
+            "auth_token": tokn,
+            "params_hidden[userid]": user,
+            "params_hidden[from]"  : "user",
+        })
diff --git a/gallery_dl/extractor/luscious.py b/gallery_dl/extractor/luscious.py
index 65ae843..879d38b 100644
--- a/gallery_dl/extractor/luscious.py
+++ b/gallery_dl/extractor/luscious.py
@@ -62,7 +62,7 @@ class LusciousAlbumExtractor(LusciousBase, GalleryExtractor):
     test = (
         ("https://luscious.net/albums/okinami-no-koigokoro_277031/", {
             "url": "7e4984a271a1072ac6483e4228a045895aff86f3",
-            "keyword": "c597c132834f4990f90bf5dee5de2a9d4ba263a4",
+            "keyword": "ab4e5b71583fd439b4c8012a642aa8b58d8d0758",
             "content": "b3a747a6464509440bd0ff6d1267e6959f8d6ff3",
         }),
         ("https://luscious.net/albums/virgin-killer-sweater_282582/", {
diff --git a/gallery_dl/extractor/ngomik.py b/gallery_dl/extractor/ngomik.py
index 8135a8a..f3608b2 100644
--- a/gallery_dl/extractor/ngomik.py
+++ b/gallery_dl/extractor/ngomik.py
@@ -44,7 +44,7 @@ class NgomikChapterExtractor(ChapterExtractor):
 
     @staticmethod
     def images(page):
-        readerarea = text.extract(page, 'id=readerarea', 'class=chnav')[0]
+        readerarea = text.extract(page, 'id="readerarea"', 'class="chnav"')[0]
         return [
             (text.unescape(url), None)
             for url in re.findall(r"\ssrc=[\"']?([^\"' >]+)", readerarea)
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index 012cb8b..da9735e 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -283,9 +283,9 @@ class SankakuPostExtractor(SankakuExtractor):
         "options": (("tags", True),),
         "keyword": {
             "tags_artist": "bonocho",
-            "tags_copyright": "batman_(series) the_dark_knight",
-            "tags_medium": "sketch copyright_name",
             "tags_studio": "dc_comics",
+            "tags_medium": "sketch copyright_name",
+            "tags_copyright": str,
             "tags_character": str,
             "tags_general": str,
         },
diff --git a/gallery_dl/extractor/sankakucomplex.py b/gallery_dl/extractor/sankakucomplex.py
index 55eda9f..0189fc9 100644
--- a/gallery_dl/extractor/sankakucomplex.py
+++ b/gallery_dl/extractor/sankakucomplex.py
@@ -34,11 +34,11 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor):
     test = (
         ("https://www.sankakucomplex.com/2019/05/11/twitter-cosplayers", {
             "url": "4a9ecc5ae917fbce469280da5b6a482510cae84d",
-            "keyword": "4b3b5766b277a5d0acbec90fa8f2343262b07efd",
+            "keyword": "bfe08310e7d9a572f568f6900e0ed0eb295aa2b3",
         }),
         ("https://www.sankakucomplex.com/2009/12/01/sexy-goddesses-of-2ch", {
             "url": "a1e249173fd6c899a8134fcfbd9c925588a63f7c",
-            "keyword": "f47a416d680717855bbc3e4f0cd44479f61d9aa4",
+            "keyword": "e78fcc23c2711befc0969a45ea5082a29efccf68",
         }),
     )
diff --git a/gallery_dl/extractor/tsumino.py b/gallery_dl/extractor/tsumino.py
index 03ee144..66ad431 100644
--- a/gallery_dl/extractor/tsumino.py
+++ b/gallery_dl/extractor/tsumino.py
@@ -65,7 +65,7 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
             "uploader"  : "sehki",
             "lang"      : "en",
             "language"  : "English",
-            "thumbnail" : "http://www.tsumino.com/Image/Thumb/40996",
+            "thumbnail" : "re:https?://www.tsumino.com/Image/Thumb/40996",
         },
     }),
     ("https://www.tsumino.com/Read/View/45834"),
diff --git a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py
new file mode 100644
index 0000000..639ec82
--- /dev/null
+++ b/gallery_dl/extractor/vsco.py
@@ -0,0 +1,176 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://vsco.co/"""
+
+from .common import Extractor, Message
+from .. import text
import text +import json + + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?vsco\.co/([^/]+)" + + +class VscoExtractor(Extractor): + """Base class for vsco extractors""" + category = "vsco" + root = "https://vsco.co" + directory_fmt = ("{category}", "{user}") + filename_fmt = "{id}.{extension}" + archive_fmt = "{id}" + + def __init__(self, match): + Extractor.__init__(self, match) + self.user = match.group(1).lower() + + def items(self): + yield Message.Version, 1 + yield Message.Directory, {"user": self.user} + for img in self.images(): + url = "https://" + (img.get("video_url") or img["responsive_url"]) + data = text.nameext_from_url(url, { + "id" : img["_id"], + "user" : self.user, + "grid" : img["grid_name"], + "meta" : img.get("image_meta") or {}, + "tags" : [tag["text"] for tag in img.get("tags") or ()], + "date" : text.parse_timestamp(img["upload_date"] // 1000), + "video" : img["is_video"], + "width" : img["width"], + "height": img["height"], + "description": img["description"], + }) + yield Message.Url, url, data + + def images(self): + """Return an iterable with all relevant image objects""" + + def _extract_preload_state(self, url): + page = self.request(url, notfound=self.subcategory).text + return json.loads(text.extract(page, "__PRELOADED_STATE__ = ", "<")[0]) + + def _pagination(self, url, params, token, key, extra): + headers = { + "Referer" : "{}/{}".format(self.root, self.user), + "Authorization" : "Bearer " + token, + "X-Client-Platform": "web", + "X-Client-Build" : "1", + } + + yield from map(self._transform_media, extra) + + while True: + data = self.request(url, params=params, headers=headers).json() + if not data.get(key): + return + yield from data[key] + params["page"] += 1 + + @staticmethod + def _transform_media(media): + media["_id"] = media["id"] + media["is_video"] = media["isVideo"] + media["grid_name"] = media["gridName"] + media["upload_date"] = media["uploadDate"] + media["responsive_url"] = media["responsiveUrl"] + media["video_url"] = media.get("videoUrl") + media["image_meta"] = media.get("imageMeta") + return media + + +class VscoUserExtractor(VscoExtractor): + """Extractor for images from a user on vsco.co""" + subcategory = "user" + pattern = BASE_PATTERN + r"/images/" + test = ("https://vsco.co/missuri/images/1", { + "range": "1-80", + "count": 80, + "pattern": r"https://im\.vsco\.co/[^/]+/[0-9a-f/]+/vsco\w+\.\w+", + }) + + def images(self): + url = "{}/{}/images/1".format(self.root, self.user) + data = self._extract_preload_state(url) + + tkn = data["users"]["currentUser"]["tkn"] + sid = str(data["sites"]["siteByUsername"][self.user]["site"]["id"]) + + url = "{}/api/2.0/medias".format(self.root) + params = {"page": 2, "size": "30", "site_id": sid} + return self._pagination(url, params, tkn, "media", ( + data["medias"]["byId"][mid]["media"] + for mid in data["medias"]["bySiteId"][sid]["medias"]["1"] + )) + + +class VscoCollectionExtractor(VscoExtractor): + """Extractor for images from a collection on vsco.co""" + subcategory = "collection" + directory_fmt = ("{category}", "{user}", "collection") + archive_fmt = "c_{user}_{id}" + pattern = BASE_PATTERN + r"/collection/" + test = ("https://vsco.co/vsco/collection/1", { + "range": "1-80", + "count": 80, + "pattern": r"https://im\.vsco\.co/[^/]+/[0-9a-f/]+/vsco\w+\.\w+", + }) + + def images(self): + url = "{}/{}/collection/1".format(self.root, self.user) + data = self._extract_preload_state(url) + + tkn = data["users"]["currentUser"]["tkn"] + cid = (data["sites"]["siteByUsername"][self.user] + 
["site"]["siteCollectionId"]) + + url = "{}/api/2.0/collections/{}/medias".format(self.root, cid) + params = {"page": 2, "size": "20"} + return self._pagination(url, params, tkn, "medias", ( + data["medias"]["byId"][mid]["media"] + for mid in data + ["collections"]["byCollectionId"][cid]["collection"]["1"] + )) + + +class VscoImageExtractor(VscoExtractor): + """Extractor for individual images on vsco.co""" + subcategory = "image" + pattern = BASE_PATTERN + r"/media/([0-9a-fA-F]+)" + test = ( + ("https://vsco.co/erenyildiz/media/5d34b93ef632433030707ce2", { + "url": "faa214d10f859f374ad91da3f7547d2439f5af08", + "content": "1394d070828d82078035f19a92f404557b56b83f", + "keyword": { + "id" : "5d34b93ef632433030707ce2", + "user" : "erenyildiz", + "grid" : "erenyildiz", + "meta" : dict, + "tags" : list, + "date" : "type:datetime", + "video" : False, + "width" : 1537, + "height": 1537, + "description": "re:Ni seviyorum. #vsco #vscox #vscochallenges", + }, + }), + ("https://vsco.co/jimenalazof/media/5b4feec558f6c45c18c040fd", { + "url": "08e7eef3301756ce81206c0b47c1e9373756a74a", + "content": "e739f058d726ee42c51c180a505747972a7dfa47", + "keyword": {"video" : True}, + }), + ) + + def __init__(self, match): + VscoExtractor.__init__(self, match) + self.media_id = match.group(2) + + def images(self): + url = "{}/{}/media/{}".format(self.root, self.user, self.media_id) + data = self._extract_preload_state(url) + media = data["medias"]["byId"].popitem()[1]["media"] + return (self._transform_media(media),) |