summary refs log tree commit diff stats
path: root/gallery_dl/text.py
diff options
context:
space:
mode:
Diffstat (limited to 'gallery_dl/text.py')
-rw-r--r-- gallery_dl/text.py 47
1 files changed, 15 insertions, 32 deletions
diff --git a/gallery_dl/text.py b/gallery_dl/text.py
index 8b06384..a6a9105 100644
--- a/gallery_dl/text.py
+++ b/gallery_dl/text.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2019 Mike Fährmann
+# Copyright 2015-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -10,33 +10,11 @@
import re
import html
-import os.path
import datetime
import urllib.parse
-
HTML_RE = re.compile("<[^>]+>")
-INVALID_XML_CHARS = (
- "\x00", "\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07",
- "\x08", "\x0b", "\x0c", "\x0e", "\x0f", "\x10", "\x11", "\x12",
- "\x13", "\x14", "\x15", "\x16", "\x17", "\x18", "\x19", "\x1a",
- "\x1b", "\x1c", "\x1d", "\x1e", "\x1f",
-)
-
-
-def clean_xml(xmldata, repl=""):
- """Replace/Remove invalid control characters in 'xmldata'"""
- if not isinstance(xmldata, str):
- try:
- xmldata = "".join(xmldata)
- except TypeError:
- return ""
- for char in INVALID_XML_CHARS:
- if char in xmldata:
- xmldata = xmldata.replace(char, repl)
- return xmldata
-
def remove_html(txt, repl=" ", sep=" "):
"""Remove html-tags from a string"""
@@ -49,11 +27,12 @@ def remove_html(txt, repl=" ", sep=" "):
return txt.strip()
-def split_html(txt, sep=None):
- """Split input string by html-tags"""
+def split_html(txt):
+ """Split input string by HTML tags"""
try:
return [
- x.strip() for x in HTML_RE.split(txt)
+ unescape(x).strip()
+ for x in HTML_RE.split(txt)
if x and not x.isspace()
]
except TypeError:
@@ -77,18 +56,22 @@ def filename_from_url(url):
def ext_from_url(url):
"""Extract the filename extension of an URL"""
- filename = filename_from_url(url)
- ext = os.path.splitext(filename)[1]
- return ext[1:].lower()
+ name, _, ext = filename_from_url(url).rpartition(".")
+ return ext.lower() if name else ""
def nameext_from_url(url, data=None):
"""Extract the last part of an URL and fill 'data' accordingly"""
if data is None:
data = {}
- name = unquote(filename_from_url(url))
- data["filename"], ext = os.path.splitext(name)
- data["extension"] = ext[1:].lower()
+
+ filename = unquote(filename_from_url(url))
+ name, _, ext = filename.rpartition(".")
+ if name:
+ data["filename"], data["extension"] = name, ext.lower()
+ else:
+ data["filename"], data["extension"] = filename, ""
+
return data