mirror of https://github.com/Tautulli/Tautulli.git synced 2025-03-12 04:35:40 -07:00

Bump bleach from 5.0.0 to 5.0.1 ()

* Bump bleach from 5.0.0 to 5.0.1

Bumps [bleach](https://github.com/mozilla/bleach) from 5.0.0 to 5.0.1.
- [Release notes](https://github.com/mozilla/bleach/releases)
- [Changelog](https://github.com/mozilla/bleach/blob/main/CHANGES)
- [Commits](https://github.com/mozilla/bleach/commits)

---
updated-dependencies:
- dependency-name: bleach
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>

* Update bleach==5.0.1

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com>

[skip ci]
dependabot[bot] authored on 2022-11-12 17:11:49 -08:00, committed by GitHub
commit d889e810f4 (parent 21f5fee403)
6 changed files with 47 additions and 26 deletions

lib/bleach/__init__.py

@@ -11,9 +11,9 @@ from bleach.sanitizer import (
 # yyyymmdd
-__releasedate__ = "20220407"
+__releasedate__ = "20220627"
 # x.y.z or x.y.z.dev0 -- semver
-__version__ = "5.0.0"
+__version__ = "5.0.1"
 __all__ = ["clean", "linkify"]

lib/bleach/html5lib_shim.py

@@ -385,7 +385,17 @@ class BleachHTMLTokenizer(HTMLTokenizer):
                 yield token

         if last_error_token:
-            yield last_error_token
+            if last_error_token["data"] == "eof-in-tag-name":
+                # Handle the case where the text being parsed ends with <
+                # followed by a series of characters. It's treated as a tag
+                # name that abruptly ends, but we should treat that like
+                # character data
+                yield {
+                    "type": TAG_TOKEN_TYPE_CHARACTERS,
+                    "data": "<" + self.currentToken["name"],
+                }
+            else:
+                yield last_error_token

     def consumeEntity(self, allowedChar=None, fromAttribute=False):
         # If this tokenizer is set to consume entities, then we can let the
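
At the user level, this tokenizer change affects input that ends with "<" followed by a bare tag name; a minimal sketch, assuming the vendored bleach 5.0.1 is importable as bleach:

    import bleach

    # With 5.0.1 the eof-in-tag-name error is turned back into character
    # data, so the trailing fragment is escaped instead of being dropped.
    print(bleach.clean("download now <here"))
    # Expected with 5.0.1: "download now &lt;here"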

lib/bleach/linkifier.py

@@ -1,5 +1,7 @@
 import re
+from urllib.parse import quote
+
 from bleach import callbacks as linkify_callbacks
 from bleach import html5lib_shim

@@ -124,11 +126,11 @@ class Linker:
         :arg bool parse_email: whether or not to linkify email addresses

-        :arg re url_re: url matching regex
+        :arg url_re: url matching regex

-        :arg re email_re: email matching regex
+        :arg email_re: email matching regex

-        :arg list-of-strings recognized_tags: the list of tags that linkify knows about;
+        :arg list recognized_tags: the list of tags that linkify knows about;
             everything else gets escaped

         :returns: linkified text as unicode

@@ -211,7 +213,7 @@ class LinkifyFilter(html5lib_shim.Filter):
     ):
         """Creates a LinkifyFilter instance

-        :arg TreeWalker source: stream
+        :arg source: stream as an html5lib TreeWalker

         :arg list callbacks: list of callbacks to run when adjusting tag attributes;
             defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

@@ -222,9 +224,9 @@ class LinkifyFilter(html5lib_shim.Filter):
         :arg bool parse_email: whether or not to linkify email addresses

-        :arg re url_re: url matching regex
+        :arg url_re: url matching regex

-        :arg re email_re: email matching regex
+        :arg email_re: email matching regex

         """
         super().__init__(source)
@@ -298,10 +300,15 @@ class LinkifyFilter(html5lib_shim.Filter):
                     {"type": "Characters", "data": text[end : match.start()]}
                 )

+            # URL-encode the "local-part" according to RFC6068
+            parts = match.group(0).split("@")
+            parts[0] = quote(parts[0])
+            address = "@".join(parts)
+
             # Run attributes through the callbacks to see what we
             # should do with this match
             attrs = {
-                (None, "href"): "mailto:%s" % match.group(0),
+                (None, "href"): "mailto:%s" % address,
                 "_text": match.group(0),
             }
             attrs = self.apply_callbacks(attrs, True)
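
The new local-part quoting can be mirrored directly with the standard library; a small sketch using a made-up address:

    from urllib.parse import quote

    # Mirror the RFC 6068 handling above: percent-encode only the
    # local-part before building the mailto: href.
    address = "first&last@example.com"  # hypothetical address
    parts = address.split("@")
    parts[0] = quote(parts[0])
    print("mailto:%s" % "@".join(parts))  # mailto:first%26last@example.com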

lib/bleach/parse_shim.py (new file)

@@ -0,0 +1 @@
+from bleach._vendor.parse import urlparse  # noqa

lib/bleach/sanitizer.py

@@ -2,10 +2,10 @@ from itertools import chain
 import re
 import warnings

-from bleach._vendor.parse import urlparse
 from xml.sax.saxutils import unescape

 from bleach import html5lib_shim
+from bleach import parse_shim

 #: List of allowed tags
@@ -247,7 +247,7 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
     ):
         """Creates a BleachSanitizerFilter instance

-        :arg Treewalker source: stream
+        :arg source: html5lib TreeWalker stream as an html5lib TreeWalker

         :arg list allowed_elements: allowed list of tags; defaults to
             ``bleach.sanitizer.ALLOWED_TAGS``
@@ -449,27 +449,27 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
         :returns: allowed value or None

         """
-        # NOTE(willkg): This transforms the value into one that's easier to
-        # match and verify, but shouldn't get returned since it's vastly
-        # different than the original value.
+        # NOTE(willkg): This transforms the value into a normalized one that's
+        # easier to match and verify, but shouldn't get returned since it's
+        # vastly different than the original value.

         # Convert all character entities in the value
-        new_value = html5lib_shim.convert_entities(value)
+        normalized_uri = html5lib_shim.convert_entities(value)

         # Nix backtick, space characters, and control characters
-        new_value = re.sub(r"[`\000-\040\177-\240\s]+", "", new_value)
+        normalized_uri = re.sub(r"[`\000-\040\177-\240\s]+", "", normalized_uri)

         # Remove REPLACEMENT characters
-        new_value = new_value.replace("\ufffd", "")
+        normalized_uri = normalized_uri.replace("\ufffd", "")

         # Lowercase it--this breaks the value, but makes it easier to match
         # against
-        new_value = new_value.lower()
+        normalized_uri = normalized_uri.lower()

         try:
             # Drop attributes with uri values that have protocols that aren't
             # allowed
-            parsed = urlparse(new_value)
+            parsed = parse_shim.urlparse(normalized_uri)
         except ValueError:
             # URI is impossible to parse, therefore it's not allowed
             return None
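
Roughly what the normalization above does, sketched with html.unescape standing in for html5lib_shim.convert_entities (which handles more entity forms):

    import re
    from html import unescape  # stand-in for html5lib_shim.convert_entities

    def normalize_uri(value):
        # Convert character entities, strip backtick/space/control characters
        # and REPLACEMENT characters, then lowercase -- good for matching the
        # scheme, not for returning to the caller.
        uri = unescape(value)
        uri = re.sub(r"[`\000-\040\177-\240\s]+", "", uri)
        uri = uri.replace("\ufffd", "")
        return uri.lower()

    print(normalize_uri("JaVa\tScRiPt&#58;alert(1)"))  # javascript:alert(1)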
@@ -481,16 +481,19 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
         else:
             # Allow uris that are just an anchor
-            if new_value.startswith("#"):
+            if normalized_uri.startswith("#"):
                 return value

             # Handle protocols that urlparse doesn't recognize like "myprotocol"
-            if ":" in new_value and new_value.split(":")[0] in allowed_protocols:
+            if (
+                ":" in normalized_uri
+                and normalized_uri.split(":")[0] in allowed_protocols
+            ):
                 return value

-            # If there's no protocol/scheme specified, then assume it's "http"
-            # and see if that's allowed
-            if "http" in allowed_protocols:
+            # If there's no protocol/scheme specified, then assume it's "http" or
+            # "https" and see if that's allowed
+            if "http" in allowed_protocols or "https" in allowed_protocols:
                 return value

         return None
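
At the API level this is the check that decides whether an href survives cleaning; a sketch assuming bleach's default ALLOWED_PROTOCOLS (http, https, mailto):

    import bleach

    tags = ["a"]
    attrs = {"a": ["href"]}

    # Disallowed scheme: the href attribute is dropped.
    print(bleach.clean('<a href="javascript:alert(1)">x</a>', tags=tags, attributes=attrs))
    # Expected: <a>x</a>

    # Allowed scheme: the href is kept.
    print(bleach.clean('<a href="https://example.com/">x</a>', tags=tags, attributes=attrs))
    # Expected: <a href="https://example.com/">x</a>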

requirements.txt

@@ -5,7 +5,7 @@ backports.csv==1.0.7
 backports.functools-lru-cache==1.6.4
 backports.zoneinfo==0.2.1
 beautifulsoup4==4.11.1
-bleach==5.0.0
+bleach==5.0.1
 certifi==2022.9.24
 cheroot==8.6.0
 cherrypy==18.6.1