diff options
| author | 2020-03-16 23:20:15 -0400 | |
|---|---|---|
| committer | 2020-03-16 23:20:15 -0400 | |
| commit | e8cc000750de972384f2f34d02d42222b4018ae9 (patch) | |
| tree | 26eb0bacedff7480d29bafcf184ca529cf9f1d9f /gallery_dl/text.py | |
| parent | 4366125d2580982abb57bc65a26fc1fb8ef2a5df (diff) | |
New upstream version 1.13.2 (upstream/1.13.2)
Diffstat (limited to 'gallery_dl/text.py')
| -rw-r--r-- | gallery_dl/text.py | 6 |
1 file changed, 4 insertions, 2 deletions
```diff
diff --git a/gallery_dl/text.py b/gallery_dl/text.py
index 72dad5b..a3f4e0a 100644
--- a/gallery_dl/text.py
+++ b/gallery_dl/text.py
@@ -15,6 +15,8 @@ import datetime
 import urllib.parse
 
 
+HTML_RE = re.compile("<[^>]+>")
+
 INVALID_XML_CHARS = (
     "\x00", "\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07",
     "\x08", "\x0b", "\x0c", "\x0e", "\x0f", "\x10", "\x11", "\x12",
@@ -39,7 +41,7 @@ def clean_xml(xmldata, repl=""):
 def remove_html(txt, repl=" ", sep=" "):
     """Remove html-tags from a string"""
     try:
-        txt = re.sub("<[^>]+>", repl, txt)
+        txt = HTML_RE.sub(repl, txt)
     except TypeError:
         return ""
     if sep:
@@ -51,7 +53,7 @@ def split_html(txt, sep=None):
     """Split input string by html-tags"""
     try:
         return [
-            x.strip() for x in re.split("<[^>]+>", txt)
+            x.strip() for x in HTML_RE.split(txt)
             if x and not x.isspace()
         ]
     except TypeError:
```
