1 file changed, 58 insertions, 26 deletions
diff --git a/gallery_dl/text.py b/gallery_dl/text.py
index c1dde94..a7539ad 100644
--- a/gallery_dl/text.py
+++ b/gallery_dl/text.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2015-2022 Mike Fährmann
+# Copyright 2015-2025 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -8,14 +8,29 @@
 
 """Collection of functions that work on strings/text"""
 
-import re
 import sys
 import html
 import time
 import datetime
 import urllib.parse
+import re as re_module
 
-HTML_RE = re.compile("<[^>]+>")
+try:
+    re_compile = re_module._compiler.compile
+except AttributeError:
+    re_compile = re_module.sre_compile.compile
+
+HTML_RE = re_compile(r"<[^>]+>")
+PATTERN_CACHE = {}
+
+
+def re(pattern):
+    """Return the compiled form of 'pattern', cached in PATTERN_CACHE"""
+    try:
+        return PATTERN_CACHE[pattern]
+    except KeyError:
+        p = PATTERN_CACHE[pattern] = re_compile(pattern)
+        return p
 
 
 def remove_html(txt, repl=" ", sep=" "):
@@ -47,8 +62,13 @@ def slugify(value):
     Adapted from:
     https://github.com/django/django/blob/master/django/utils/text.py
     """
-    value = re.sub(r"[^\w\s-]", "", str(value).lower())
-    return re.sub(r"[-\s]+", "-", value).strip("-_")
+    value = re(r"[^\w\s-]").sub("", str(value).lower())
+    return re(r"[-\s]+").sub("-", value).strip("-_")
+
+
+def sanitize_whitespace(value):
+    """Strip 'value' and collapse whitespace runs into single spaces"""
+    return re(r"\s+").sub(" ", value.strip())
 
 
 def ensure_http_scheme(url, scheme="https://"):
@@ -100,7 +120,7 @@ def nameext_from_url(url, data=None):
     return data
 
 
-def extract(txt, begin, end, pos=0):
+def extract(txt, begin, end, pos=None):
     """Extract the text between 'begin' and 'end' from 'txt'
 
     Args:
@@ -125,7 +145,7 @@ def extract(txt, begin, end, pos=0):
         last = txt.index(end, first)
         return txt[first:last], last+len(end)
     except Exception:
-        return None, pos
+        return None, 0 if pos is None else pos
 
 
 def extr(txt, begin, end, default=""):
@@ -137,17 +157,26 @@ def extr(txt, begin, end, default=""):
         return default
 
 
-def rextract(txt, begin, end, pos=-1):
+def rextract(txt, begin, end, pos=None):
     try:
         lbeg = len(begin)
-        first = txt.rindex(begin, 0, pos)
+        first = txt.rindex(begin, None, pos)
         last = txt.index(end, first + lbeg)
         return txt[first + lbeg:last], first
     except Exception:
-        return None, pos
+        return None, -1 if pos is None else pos
+
+
+def rextr(txt, begin, end, pos=None, default=""):
+    """Stripped-down version of 'rextract()'"""
+    try:
+        first = txt.rindex(begin, None, pos) + len(begin)
+        return txt[first:txt.index(end, first)]
+    except Exception:
+        return default
 
 
-def extract_all(txt, rules, pos=0, values=None):
+def extract_all(txt, rules, pos=None, values=None):
     """Calls extract for each rule and returns the result in a dict"""
     if values is None:
         values = {}
@@ -155,10 +184,10 @@ def extract_all(txt, rules, pos=0, values=None):
         result, pos = extract(txt, begin, end, pos)
         if key:
             values[key] = result
-    return values, pos
+    return values, 0 if pos is None else pos
 
 
-def extract_iter(txt, begin, end, pos=0):
+def extract_iter(txt, begin, end, pos=None):
     """Yield values that would be returned by repeated calls of extract()"""
     try:
         index = txt.index
@@ -173,7 +202,7 @@ def extract_iter(txt, begin, end, pos=0):
         return
 
 
-def extract_from(txt, pos=0, default=""):
+def extract_from(txt, pos=None, default=""):
     """Returns a function object that extracts from 'txt'"""
     def extr(begin, end, index=txt.index, txt=txt):
         nonlocal pos
@@ -190,21 +219,22 @@ def extract_from(txt, pos=0, default=""):
 def parse_unicode_escapes(txt):
     """Convert JSON Unicode escapes in 'txt' into actual characters"""
     if "\\u" in txt:
-        return re.sub(r"\\u([0-9a-fA-F]{4})", _hex_to_char, txt)
+        return re(r"\\u([0-9a-fA-F]{4})").sub(_hex_to_char, txt)
     return txt
 
 
 def _hex_to_char(match):
-    return chr(int(match.group(1), 16))
+    return chr(int(match[1], 16))
 
 
 def parse_bytes(value, default=0, suffixes="bkmgtp"):
     """Convert a bytes-amount ("500k", "2.5M", ...) to int"""
-    try:
-        last = value[-1].lower()
-    except Exception:
+    if not value:
         return default
 
+    value = str(value).strip()
+    last = value[-1].lower()
+
     if last in suffixes:
         mul = 1024 ** suffixes.index(last)
         value = value[:-1]
@@ -279,12 +309,19 @@ def parse_query_list(qs, as_list=()):
                     else:
                         result[name] = [value]
                 elif name not in result:
-                    result[name] = unquote(value.replace("+", " "))
+                    result[name] = value
     except Exception:
         pass
     return result
 
 
+def build_query(params):
+    return "&".join([
+        f"{quote(name)}={quote(value)}"
+        for name, value in params.items()
+    ])
+
+
 if sys.hexversion < 0x30c0000:
     # Python <= 3.11
     def parse_timestamp(ts, default=None):
@@ -307,12 +344,7 @@ else:
 def parse_datetime(date_string, format="%Y-%m-%dT%H:%M:%S%z", utcoffset=0):
     """Create a datetime object by parsing 'date_string'"""
     try:
-        if format.endswith("%z") and date_string[-3] == ":":
-            # workaround for Python < 3.7: +00:00 -> +0000
-            ds = date_string[:-3] + date_string[-2:]
-        else:
-            ds = date_string
-        d = datetime.datetime.strptime(ds, format)
+        d = datetime.datetime.strptime(date_string, format)
         o = d.utcoffset()
         if o is not None:
             # convert to naive UTC