about | summary | refs | log | tree | commit | diff | stats
path: root/gallery_dl/text.py
diff options
context:
space:
mode:
author: Unit 193 <unit193@unit193.net> 2025-07-31 01:22:07 -0400
committer: Unit 193 <unit193@unit193.net> 2025-07-31 01:22:07 -0400
commit: d9539f96cc7ac112b7d8faad022190fbbc88c745 (patch)
tree: 471249d60b9202c00d7d82abec8b296fc881292e /gallery_dl/text.py
parent: 889fc15f272118bf277737b6fac29d3faeffc641 (diff)
parent: a6e995c093de8aae2e91a0787281bb34c0b871eb (diff)
Update upstream source from tag 'upstream/1.30.2'
Update to upstream version '1.30.2' with Debian dir f0dcd28a671f8600479182ff128e05ba8904a0d8
Diffstat (limited to 'gallery_dl/text.py')
-rw-r--r-- gallery_dl/text.py 84
1 files changed, 58 insertions, 26 deletions
diff --git a/gallery_dl/text.py b/gallery_dl/text.py
index c1dde94..a7539ad 100644
--- a/gallery_dl/text.py
+++ b/gallery_dl/text.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2022 Mike Fährmann
+# Copyright 2015-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,14 +8,29 @@
"""Collection of functions that work on strings/text"""
-import re
import sys
import html
import time
import datetime
import urllib.parse
+import re as re_module
-HTML_RE = re.compile("<[^>]+>")
+try:
+ re_compile = re_module._compiler.compile
+except AttributeError:
+ re_compile = re_module.sre_compile.compile
+
+HTML_RE = re_compile(r"<[^>]+>")
+PATTERN_CACHE = {}
+
+
+def re(pattern):
+ """Compile a regular expression pattern"""
+ try:
+ return PATTERN_CACHE[pattern]
+ except KeyError:
+ p = PATTERN_CACHE[pattern] = re_compile(pattern)
+ return p
def remove_html(txt, repl=" ", sep=" "):
@@ -47,8 +62,13 @@ def slugify(value):
Adapted from:
https://github.com/django/django/blob/master/django/utils/text.py
"""
- value = re.sub(r"[^\w\s-]", "", str(value).lower())
- return re.sub(r"[-\s]+", "-", value).strip("-_")
+ value = re(r"[^\w\s-]").sub("", str(value).lower())
+ return re(r"[-\s]+").sub("-", value).strip("-_")
+
+
+def sanitize_whitespace(value):
+ """Replace all whitespace characters with a single space"""
+ return re(r"\s+").sub(" ", value.strip())
def ensure_http_scheme(url, scheme="https://"):
@@ -100,7 +120,7 @@ def nameext_from_url(url, data=None):
return data
-def extract(txt, begin, end, pos=0):
+def extract(txt, begin, end, pos=None):
"""Extract the text between 'begin' and 'end' from 'txt'
Args:
@@ -125,7 +145,7 @@ def extract(txt, begin, end, pos=0):
last = txt.index(end, first)
return txt[first:last], last+len(end)
except Exception:
- return None, pos
+ return None, 0 if pos is None else pos
def extr(txt, begin, end, default=""):
@@ -137,17 +157,26 @@ def extr(txt, begin, end, default=""):
return default
-def rextract(txt, begin, end, pos=-1):
+def rextract(txt, begin, end, pos=None):
try:
lbeg = len(begin)
- first = txt.rindex(begin, 0, pos)
+ first = txt.rindex(begin, None, pos)
last = txt.index(end, first + lbeg)
return txt[first + lbeg:last], first
except Exception:
- return None, pos
+ return None, -1 if pos is None else pos
+
+
+def rextr(txt, begin, end, pos=None, default=""):
+ """Stripped-down version of 'rextract()'"""
+ try:
+ first = txt.rindex(begin, None, pos) + len(begin)
+ return txt[first:txt.index(end, first)]
+ except Exception:
+ return default
-def extract_all(txt, rules, pos=0, values=None):
+def extract_all(txt, rules, pos=None, values=None):
"""Calls extract for each rule and returns the result in a dict"""
if values is None:
values = {}
@@ -155,10 +184,10 @@ def extract_all(txt, rules, pos=0, values=None):
result, pos = extract(txt, begin, end, pos)
if key:
values[key] = result
- return values, pos
+ return values, 0 if pos is None else pos
-def extract_iter(txt, begin, end, pos=0):
+def extract_iter(txt, begin, end, pos=None):
"""Yield values that would be returned by repeated calls of extract()"""
try:
index = txt.index
@@ -173,7 +202,7 @@ def extract_iter(txt, begin, end, pos=0):
return
-def extract_from(txt, pos=0, default=""):
+def extract_from(txt, pos=None, default=""):
"""Returns a function object that extracts from 'txt'"""
def extr(begin, end, index=txt.index, txt=txt):
nonlocal pos
@@ -190,21 +219,22 @@ def extract_from(txt, pos=0, default=""):
def parse_unicode_escapes(txt):
"""Convert JSON Unicode escapes in 'txt' into actual characters"""
if "\\u" in txt:
- return re.sub(r"\\u([0-9a-fA-F]{4})", _hex_to_char, txt)
+ return re(r"\\u([0-9a-fA-F]{4})").sub(_hex_to_char, txt)
return txt
def _hex_to_char(match):
- return chr(int(match.group(1), 16))
+ return chr(int(match[1], 16))
def parse_bytes(value, default=0, suffixes="bkmgtp"):
"""Convert a bytes-amount ("500k", "2.5M", ...) to int"""
- try:
- last = value[-1].lower()
- except Exception:
+ if not value:
return default
+ value = str(value).strip()
+ last = value[-1].lower()
+
if last in suffixes:
mul = 1024 ** suffixes.index(last)
value = value[:-1]
@@ -279,12 +309,19 @@ def parse_query_list(qs, as_list=()):
else:
result[name] = [value]
elif name not in result:
- result[name] = unquote(value.replace("+", " "))
+ result[name] = value
except Exception:
pass
return result
+def build_query(params):
+ return "&".join([
+ f"{quote(name)}={quote(value)}"
+ for name, value in params.items()
+ ])
+
+
if sys.hexversion < 0x30c0000:
# Python <= 3.11
def parse_timestamp(ts, default=None):
@@ -307,12 +344,7 @@ else:
def parse_datetime(date_string, format="%Y-%m-%dT%H:%M:%S%z", utcoffset=0):
"""Create a datetime object by parsing 'date_string'"""
try:
- if format.endswith("%z") and date_string[-3] == ":":
- # workaround for Python < 3.7: +00:00 -> +0000
- ds = date_string[:-3] + date_string[-2:]
- else:
- ds = date_string
- d = datetime.datetime.strptime(ds, format)
+ d = datetime.datetime.strptime(date_string, format)
o = d.utcoffset()
if o is not None:
# convert to naive UTC