# gallery_dl/extractor/cfake.py
# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://cfake.com/"""

from .common import Extractor, Message
from .. import text

# Regex prefix shared by every extractor pattern in this module:
# an optional http(s) scheme, an optional "www." and the cfake.com host.
BASE_PATTERN = r"(?:https?://)?(?:www\.)?cfake\.com"


class CfakeExtractor(Extractor):
    """Shared implementation for all cfake.com gallery extractors

    Subclasses only provide a URL 'pattern'; items() unpacks five values
    from self.groups: gallery type, gallery name, gallery ID,
    optional sub ID, and optional start page.
    """
    category = "cfake"
    root = "https://cfake.com"
    # Output layout and archive key
    directory_fmt = ("{category}", "{type}", "{type_name} ({type_id})")
    filename_fmt = "{category}_{type_name}_{id}.{extension}"
    archive_fmt = "{id}"

    def items(self):
        """Yield one Directory message, then a Url message per image

        Gallery pages are requested one after another, beginning with the
        page number taken from the input URL (1 when absent), until a page
        contains no images or no link to a following page.
        """
        kind, name, gallery_id, sub_id, page_num = self.groups

        # Turn a plural "...ies" type into its singular "...y" form
        # (e.g. "categories" -> "category")
        if kind.endswith("ies"):
            kind = f"{kind[:-3]}y"

        page_num = text.parse_int(page_num, 1)

        meta = self.kwdict
        meta["type"] = kind
        meta["type_id"] = text.parse_int(gallery_id)
        meta["type_name"] = text.unquote(name).replace("_", " ")
        meta["sub_id"] = text.parse_int(sub_id)
        meta["page"] = page_num
        yield Message.Directory, "", {}

        # NOTE(review): the singularized type is used in the request URL
        # as well, so a ".../images/categories/..." input is fetched from
        # ".../images/category/..." - confirm the site serves both forms.
        gallery_url = f"{self.root}/images/{kind}/{name}/{gallery_id}"
        if sub_id:
            gallery_url += f"/{sub_id}"

        while True:
            # The first page has no "/pN" suffix
            if page_num < 2:
                page_url = gallery_url
            else:
                page_url = f"{gallery_url}/p{page_num}"
            html = self.request(page_url).text

            # Running image number across pages; the factor 50 is
            # presumably the site's page size - TODO confirm
            offset = (page_num - 1) * 50
            count = 0
            for count, image in enumerate(self._extract_images(html), 1):
                image["num"] = count + offset
                image_url = image["url"]
                yield (Message.Url, image_url,
                       text.nameext_from_url(image_url, image))

            # Stop on an empty page or when no following page is linked
            if not count:
                return
            page_num = self._check_pagination(html)
            if not page_num:
                return
            meta["page"] = page_num

    def _extract_images(self, page):
        """Yield a metadata dict for every image entry found in 'page'

        Each dict has the keys "url", "id", "name", "date" and "rating".
        Entries without a 'show=' parameter are skipped.
        """
        entries = text.extract_iter(
            page, '<a href="javascript:showimage(', '</div></div>')

        for entry in entries:
            # Relative image path from the showimage() argument,
            # e.g. 'big.php?show=2025/filename.jpg&id_picture=...'
            path = text.extr(entry, "show=", "&")
            if not path:
                continue

            raw_id = text.extr(entry, "id_picture=", "&")
            raw_name = text.extr(entry, "p_name=", "'")
            date = text.extr(entry, 'id="date_vignette">', '</div>')

            # Rating is taken from the 'width:...px' value
            # of the "current-rating" element
            rating_html = text.extr(
                entry, 'class="current-rating"', '</li>')
            rating = text.extr(rating_html, 'width:', 'px')

            if raw_id:
                picture_id = text.parse_int(raw_id)
            else:
                picture_id = 0

            if raw_name:
                picture_name = text.unescape(raw_name)
            else:
                picture_name = ""

            yield {
                "url": f"{self.root}/medias/photos/{path}",
                "id": picture_id,
                "name": picture_name,
                "date": date,
                "rating": rating,
            }

    def _check_pagination(self, page):
        """Return the number of the following page, or None if there is none

        The current page number is read from the link text inside the
        'id="num_page_current"' element. The following page only counts
        as existing when '/pN' followed by a double quote or a space
        occurs somewhere in 'page'.
        """
        marker = text.extr(page, 'id="num_page_current"', '</div>')
        if not marker:
            return None

        # Link text between '">' and '</a>' holds the current page number
        label = text.extr(marker, '">', '</a>')
        if not label:
            return None

        current = text.parse_int(label)
        if not current:
            return None

        following = current + 1
        link = f"/p{following}"
        if any(f"{link}{end}" in page for end in ('"', " ")):
            return following
        return None


class CfakeCelebrityExtractor(CfakeExtractor):
    """Extractor for cfake.com celebrity galleries"""
    subcategory = "celebrity"
    # Groups: type, name, ID, an always-empty sub ID, optional page number
    pattern = (
        rf"{BASE_PATTERN}/images/(celebrity)/([^/?#]+)"
        r"/(\d+)()(?:/p(\d+))?"
    )
    example = "https://cfake.com/images/celebrity/NAME/123"


class CfakeCategoryExtractor(CfakeExtractor):
    """Extractor for cfake.com category galleries"""
    subcategory = "category"
    # Groups: type, name, ID, an always-empty sub ID, optional page number
    pattern = (
        rf"{BASE_PATTERN}/images/(categories)/([^/?#]+)"
        r"/(\d+)()(?:/p(\d+))?"
    )
    example = "https://cfake.com/images/categories/NAME/123"


class CfakeCreatedExtractor(CfakeExtractor):
    """Extractor for cfake.com 'created' galleries"""
    subcategory = "created"
    # Groups: type, name, ID, mandatory sub ID, optional page number
    pattern = (
        rf"{BASE_PATTERN}/images/(created)/([^/?#]+)"
        r"/(\d+)/(\d+)(?:/p(\d+))?"
    )
    example = "https://cfake.com/images/created/NAME/12345/123"


class CfakeCountryExtractor(CfakeExtractor):
    """Extractor for cfake.com country galleries"""
    subcategory = "country"
    # Groups: type, name, ID, mandatory sub ID, optional page number
    pattern = (
        rf"{BASE_PATTERN}/images/(country)/([^/?#]+)"
        r"/(\d+)/(\d+)(?:/p(\d+))?"
    )
    example = "https://cfake.com/images/country/NAME/12345/123"