From 1466a391d1b4428ae244689933edfcbdb24dadd5 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 2 Mar 2023 20:55:01 -0800
Subject: [PATCH] Bump bleach from 5.0.1 to 6.0.0 (#1979)

* Bump bleach from 5.0.1 to 6.0.0

Bumps [bleach](https://github.com/mozilla/bleach) from 5.0.1 to 6.0.0.
- [Release notes](https://github.com/mozilla/bleach/releases)
- [Changelog](https://github.com/mozilla/bleach/blob/main/CHANGES)
- [Commits](https://github.com/mozilla/bleach/compare/v5.0.1...v6.0.0)

---
updated-dependencies:
- dependency-name: bleach
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>

* Update bleach==6.0.0

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com>

[skip ci]
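
Note for downstream call sites (illustrative, not from the upstream changelog):
bleach 6.0.0 documents the tag allow-lists for clean() and linkify's
skip_tags / recognized_tags as sets rather than lists, and the ALLOWED_TAGS,
ALLOWED_PROTOCOLS, and HTML_TAGS defaults are now frozensets. A minimal sketch
of a caller updated for the new types, using made-up allow-lists and input
text (lists are generally still accepted, but sets match the 6.0.0 docstrings
and the new frozenset defaults):

    import bleach

    # Pass allow-lists as sets, mirroring the frozenset defaults in 6.0.0.
    cleaned = bleach.clean(
        '<a href="https://example.com">hi</a><script>alert(1)</script>',
        tags={"a", "abbr", "b"},
        protocols={"http", "https"},
        strip=True,
    )

    # skip_tags is likewise documented as a set in 6.0.0.
    linked = bleach.linkify("see https://example.com", skip_tags={"pre"})

Two related changes visible in the diff below: allowing a "style" attribute
without setting css_sanitizer now emits the new NoCssSanitizerWarning (see the
check added to Cleaner.__init__), and code that constructs
BleachSanitizerFilter directly needs the renamed keyword arguments
(allowed_tags instead of allowed_elements, strip_disallowed_tags instead of
strip_disallowed_elements).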
---
 lib/bleach/__init__.py      |   6 +-
 lib/bleach/html5lib_shim.py | 245 ++++++++++++++++++------------------
 lib/bleach/linkifier.py     |  82 ++++++++++--
 lib/bleach/sanitizer.py     | 154 +++++++++++++----------
 requirements.txt            |   2 +-
 5 files changed, 291 insertions(+), 198 deletions(-)

diff --git a/lib/bleach/__init__.py b/lib/bleach/__init__.py
index bbcc2e03..4e87eb80 100644
--- a/lib/bleach/__init__.py
+++ b/lib/bleach/__init__.py
@@ -11,9 +11,9 @@ from bleach.sanitizer import (
 
 
 # yyyymmdd
-__releasedate__ = "20220627"
+__releasedate__ = "20230123"
 # x.y.z or x.y.z.dev0 -- semver
-__version__ = "5.0.1"
+__version__ = "6.0.0"
 
 
 __all__ = ["clean", "linkify"]
@@ -52,7 +52,7 @@ def clean(
 
     :arg str text: the text to clean
 
-    :arg list tags: allowed list of tags; defaults to
+    :arg set tags: set of allowed tags; defaults to
         ``bleach.sanitizer.ALLOWED_TAGS``
 
     :arg dict attributes: allowed attributes; can be a callable, list or dict;
diff --git a/lib/bleach/html5lib_shim.py b/lib/bleach/html5lib_shim.py
index d121953b..aa5189b1 100644
--- a/lib/bleach/html5lib_shim.py
+++ b/lib/bleach/html5lib_shim.py
@@ -38,6 +38,9 @@ from bleach._vendor.html5lib.filters.sanitizer import (
     allowed_protocols,
     allowed_css_properties,
     allowed_svg_properties,
+    attr_val_is_uri,
+    svg_attr_val_allows_ref,
+    svg_allow_local_href,
 )  # noqa: E402 module level import not at top of file
 from bleach._vendor.html5lib.filters.sanitizer import (
     Filter as SanitizerFilter,
@@ -78,127 +81,129 @@ TAG_TOKEN_TYPE_PARSEERROR = constants.tokenTypes["ParseError"]
 
 #: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
 #: https://html.spec.whatwg.org/multipage/indices.html#elements-3
-HTML_TAGS = [
-    "a",
-    "abbr",
-    "address",
-    "area",
-    "article",
-    "aside",
-    "audio",
-    "b",
-    "base",
-    "bdi",
-    "bdo",
-    "blockquote",
-    "body",
-    "br",
-    "button",
-    "canvas",
-    "caption",
-    "cite",
-    "code",
-    "col",
-    "colgroup",
-    "data",
-    "datalist",
-    "dd",
-    "del",
-    "details",
-    "dfn",
-    "dialog",
-    "div",
-    "dl",
-    "dt",
-    "em",
-    "embed",
-    "fieldset",
-    "figcaption",
-    "figure",
-    "footer",
-    "form",
-    "h1",
-    "h2",
-    "h3",
-    "h4",
-    "h5",
-    "h6",
-    "head",
-    "header",
-    "hgroup",
-    "hr",
-    "html",
-    "i",
-    "iframe",
-    "img",
-    "input",
-    "ins",
-    "kbd",
-    "keygen",
-    "label",
-    "legend",
-    "li",
-    "link",
-    "map",
-    "mark",
-    "menu",
-    "meta",
-    "meter",
-    "nav",
-    "noscript",
-    "object",
-    "ol",
-    "optgroup",
-    "option",
-    "output",
-    "p",
-    "param",
-    "picture",
-    "pre",
-    "progress",
-    "q",
-    "rp",
-    "rt",
-    "ruby",
-    "s",
-    "samp",
-    "script",
-    "section",
-    "select",
-    "slot",
-    "small",
-    "source",
-    "span",
-    "strong",
-    "style",
-    "sub",
-    "summary",
-    "sup",
-    "table",
-    "tbody",
-    "td",
-    "template",
-    "textarea",
-    "tfoot",
-    "th",
-    "thead",
-    "time",
-    "title",
-    "tr",
-    "track",
-    "u",
-    "ul",
-    "var",
-    "video",
-    "wbr",
-]
+HTML_TAGS = frozenset(
+    (
+        "a",
+        "abbr",
+        "address",
+        "area",
+        "article",
+        "aside",
+        "audio",
+        "b",
+        "base",
+        "bdi",
+        "bdo",
+        "blockquote",
+        "body",
+        "br",
+        "button",
+        "canvas",
+        "caption",
+        "cite",
+        "code",
+        "col",
+        "colgroup",
+        "data",
+        "datalist",
+        "dd",
+        "del",
+        "details",
+        "dfn",
+        "dialog",
+        "div",
+        "dl",
+        "dt",
+        "em",
+        "embed",
+        "fieldset",
+        "figcaption",
+        "figure",
+        "footer",
+        "form",
+        "h1",
+        "h2",
+        "h3",
+        "h4",
+        "h5",
+        "h6",
+        "head",
+        "header",
+        "hgroup",
+        "hr",
+        "html",
+        "i",
+        "iframe",
+        "img",
+        "input",
+        "ins",
+        "kbd",
+        "keygen",
+        "label",
+        "legend",
+        "li",
+        "link",
+        "map",
+        "mark",
+        "menu",
+        "meta",
+        "meter",
+        "nav",
+        "noscript",
+        "object",
+        "ol",
+        "optgroup",
+        "option",
+        "output",
+        "p",
+        "param",
+        "picture",
+        "pre",
+        "progress",
+        "q",
+        "rp",
+        "rt",
+        "ruby",
+        "s",
+        "samp",
+        "script",
+        "section",
+        "select",
+        "slot",
+        "small",
+        "source",
+        "span",
+        "strong",
+        "style",
+        "sub",
+        "summary",
+        "sup",
+        "table",
+        "tbody",
+        "td",
+        "template",
+        "textarea",
+        "tfoot",
+        "th",
+        "thead",
+        "time",
+        "title",
+        "tr",
+        "track",
+        "u",
+        "ul",
+        "var",
+        "video",
+        "wbr",
+    )
+)
 
 
 #: List of block level HTML tags, as per https://github.com/mozilla/bleach/issues/369
 #: from mozilla on 2019.07.11
 #: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements
 HTML_TAGS_BLOCK_LEVEL = frozenset(
-    [
+    (
         "address",
         "article",
         "aside",
@@ -232,7 +237,7 @@ HTML_TAGS_BLOCK_LEVEL = frozenset(
         "section",
         "table",
         "ul",
-    ]
+    )
 )
 
 
@@ -473,7 +478,7 @@ class BleachHTMLParser(HTMLParser):
 
     def __init__(self, tags, strip, consume_entities, **kwargs):
         """
-        :arg tags: list of allowed tags--everything else is either stripped or
+        :arg tags: set of allowed tags--everything else is either stripped or
             escaped; if None, then this doesn't look at tags at all
         :arg strip: whether to strip disallowed tags (True) or escape them (False);
             if tags=None, then this doesn't have any effect
@@ -481,7 +486,9 @@ class BleachHTMLParser(HTMLParser):
             leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)
 
         """
-        self.tags = [tag.lower() for tag in tags] if tags is not None else None
+        self.tags = (
+            frozenset((tag.lower() for tag in tags)) if tags is not None else None
+        )
         self.strip = strip
         self.consume_entities = consume_entities
         super().__init__(**kwargs)
@@ -691,7 +698,7 @@ class BleachHTMLSerializer(HTMLSerializer):
                 # Only leave entities in that are not ambiguous. If they're
                 # ambiguous, then we escape the ampersand.
                 if entity is not None and convert_entity(entity) is not None:
-                    yield "&" + entity + ";"
+                    yield f"&{entity};"
 
                     # Length of the entity plus 2--one for & at the beginning
                     # and one for ; at the end
diff --git a/lib/bleach/linkifier.py b/lib/bleach/linkifier.py
index b3b83e62..679d7ead 100644
--- a/lib/bleach/linkifier.py
+++ b/lib/bleach/linkifier.py
@@ -120,9 +120,10 @@ class Linker:
         :arg list callbacks: list of callbacks to run when adjusting tag attributes;
             defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
 
-        :arg list skip_tags: list of tags that you don't want to linkify the
-            contents of; for example, you could set this to ``['pre']`` to skip
-            linkifying contents of ``pre`` tags
+        :arg set skip_tags: set of tags that you don't want to linkify the
+            contents of; for example, you could set this to ``{'pre'}`` to skip
+            linkifying contents of ``pre`` tags; ``None`` means you don't
+            want linkify to skip any tags
 
         :arg bool parse_email: whether or not to linkify email addresses
 
@@ -130,7 +131,7 @@ class Linker:
 
         :arg email_re: email matching regex
 
-        :arg list recognized_tags: the list of tags that linkify knows about;
+        :arg set recognized_tags: the set of tags that linkify knows about;
             everything else gets escaped
 
         :returns: linkified text as unicode
@@ -145,15 +146,18 @@ class Linker:
         # Create a parser/tokenizer that allows all HTML tags and escapes
         # anything not in that list.
         self.parser = html5lib_shim.BleachHTMLParser(
-            tags=recognized_tags,
+            tags=frozenset(recognized_tags),
             strip=False,
-            consume_entities=True,
+            consume_entities=False,
             namespaceHTMLElements=False,
         )
         self.walker = html5lib_shim.getTreeWalker("etree")
         self.serializer = html5lib_shim.BleachHTMLSerializer(
             quote_attr_values="always",
             omit_optional_tags=False,
+            # We want to leave entities as they are without escaping or
+            # resolving or expanding
+            resolve_entities=False,
             # linkify does not sanitize
             sanitize=False,
             # linkify preserves attr order
@@ -218,8 +222,8 @@ class LinkifyFilter(html5lib_shim.Filter):
         :arg list callbacks: list of callbacks to run when adjusting tag attributes;
             defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
 
-        :arg list skip_tags: list of tags that you don't want to linkify the
-            contents of; for example, you could set this to ``['pre']`` to skip
+        :arg set skip_tags: set of tags that you don't want to linkify the
+            contents of; for example, you could set this to ``{'pre'}`` to skip
             linkifying contents of ``pre`` tags
 
         :arg bool parse_email: whether or not to linkify email addresses
@@ -232,7 +236,7 @@ class LinkifyFilter(html5lib_shim.Filter):
         super().__init__(source)
 
         self.callbacks = callbacks or []
-        self.skip_tags = skip_tags or []
+        self.skip_tags = skip_tags or {}
         self.parse_email = parse_email
 
         self.url_re = url_re
@@ -510,6 +514,62 @@ class LinkifyFilter(html5lib_shim.Filter):
                 yield {"type": "Characters", "data": str(new_text)}
                 yield token_buffer[-1]
 
+    def extract_entities(self, token):
+        """Handles Characters tokens with entities
+
+        Our overridden tokenizer doesn't do anything with entities. However,
+        that means that the serializer will convert all ``&`` in Characters
+        tokens to ``&amp;``.
+
+        Since we don't want that, we extract entities here and convert them to
+        Entity tokens so the serializer will let them be.
+
+        :arg token: the Characters token to work on
+
+        :returns: generator of tokens
+
+        """
+        data = token.get("data", "")
+
+        # If there isn't a & in the data, we can return now
+        if "&" not in data:
+            yield token
+            return
+
+        new_tokens = []
+
+        # For each possible entity that starts with a "&", we try to extract an
+        # actual entity and re-tokenize accordingly
+        for part in html5lib_shim.next_possible_entity(data):
+            if not part:
+                continue
+
+            if part.startswith("&"):
+                entity = html5lib_shim.match_entity(part)
+                if entity is not None:
+                    if entity == "amp":
+                        # LinkifyFilter can't match urls across token boundaries
+                        # which is problematic with &amp; since that shows up in
+                        # querystrings all the time. This special-cases &amp;
+                        # and converts it to a & and sticks it in as a
+                        # Characters token. It'll get merged with surrounding
+                        # tokens in the BleachSanitizerfilter.__iter__ and
+                        # escaped in the serializer.
+                        new_tokens.append({"type": "Characters", "data": "&"})
+                    else:
+                        new_tokens.append({"type": "Entity", "name": entity})
+
+                    # Length of the entity plus 2--one for & at the beginning
+                    # and one for ; at the end
+                    remainder = part[len(entity) + 2 :]
+                    if remainder:
+                        new_tokens.append({"type": "Characters", "data": remainder})
+                    continue
+
+            new_tokens.append({"type": "Characters", "data": part})
+
+        yield from new_tokens
+
     def __iter__(self):
         in_a = False
         in_skip_tag = None
@@ -564,8 +624,8 @@ class LinkifyFilter(html5lib_shim.Filter):
 
                 new_stream = self.handle_links(new_stream)
 
-                for token in new_stream:
-                    yield token
+                for new_token in new_stream:
+                    yield from self.extract_entities(new_token)
 
                 # We've already yielded this token, so continue
                 continue
diff --git a/lib/bleach/sanitizer.py b/lib/bleach/sanitizer.py
index 6527ac03..8662a879 100644
--- a/lib/bleach/sanitizer.py
+++ b/lib/bleach/sanitizer.py
@@ -8,21 +8,23 @@ from bleach import html5lib_shim
 from bleach import parse_shim
 
 
-#: List of allowed tags
-ALLOWED_TAGS = [
-    "a",
-    "abbr",
-    "acronym",
-    "b",
-    "blockquote",
-    "code",
-    "em",
-    "i",
-    "li",
-    "ol",
-    "strong",
-    "ul",
-]
+#: Set of allowed tags
+ALLOWED_TAGS = frozenset(
+    (
+        "a",
+        "abbr",
+        "acronym",
+        "b",
+        "blockquote",
+        "code",
+        "em",
+        "i",
+        "li",
+        "ol",
+        "strong",
+        "ul",
+    )
+)
 
 
 #: Map of allowed attributes by tag
@@ -33,7 +35,7 @@ ALLOWED_ATTRIBUTES = {
 }
 
 #: List of allowed protocols
-ALLOWED_PROTOCOLS = ["http", "https", "mailto"]
+ALLOWED_PROTOCOLS = frozenset(("http", "https", "mailto"))
 
 #: Invisible characters--0 to and including 31 except 9 (tab), 10 (lf), and 13 (cr)
 INVISIBLE_CHARACTERS = "".join(
@@ -48,6 +50,10 @@ INVISIBLE_CHARACTERS_RE = re.compile("[" + INVISIBLE_CHARACTERS + "]", re.UNICOD
 INVISIBLE_REPLACEMENT_CHAR = "?"
 
 
+class NoCssSanitizerWarning(UserWarning):
+    pass
+
+
 class Cleaner:
     """Cleaner for cleaning HTML fragments of malicious content
 
@@ -89,7 +95,7 @@ class Cleaner:
     ):
         """Initializes a Cleaner
 
-        :arg list tags: allowed list of tags; defaults to
+        :arg set tags: set of allowed tags; defaults to
             ``bleach.sanitizer.ALLOWED_TAGS``
 
         :arg dict attributes: allowed attributes; can be a callable, list or dict;
@@ -143,6 +149,25 @@ class Cleaner:
             alphabetical_attributes=False,
         )
 
+        if css_sanitizer is None:
+            # FIXME(willkg): this doesn't handle when attributes or an
+            # attributes value is a callable
+            attributes_values = []
+            if isinstance(attributes, list):
+                attributes_values = attributes
+
+            elif isinstance(attributes, dict):
+                attributes_values = []
+                for values in attributes.values():
+                    if isinstance(values, (list, tuple)):
+                        attributes_values.extend(values)
+
+            if "style" in attributes_values:
+                warnings.warn(
+                    "'style' attribute specified, but css_sanitizer not set.",
+                    category=NoCssSanitizerWarning,
+                )
+
     def clean(self, text):
         """Cleans text and returns sanitized result as unicode
 
@@ -155,9 +180,8 @@ class Cleaner:
         """
         if not isinstance(text, str):
             message = (
-                "argument cannot be of '{name}' type, must be of text type".format(
-                    name=text.__class__.__name__
-                )
+                f"argument cannot be of {text.__class__.__name__!r} type, "
+                + "must be of text type"
             )
             raise TypeError(message)
 
@@ -167,13 +191,11 @@ class Cleaner:
         dom = self.parser.parseFragment(text)
         filtered = BleachSanitizerFilter(
             source=self.walker(dom),
-            # Bleach-sanitizer-specific things
+            allowed_tags=self.tags,
             attributes=self.attributes,
-            strip_disallowed_elements=self.strip,
+            strip_disallowed_tags=self.strip,
             strip_html_comments=self.strip_comments,
             css_sanitizer=self.css_sanitizer,
-            # html5lib-sanitizer things
-            allowed_elements=self.tags,
             allowed_protocols=self.protocols,
         )
 
@@ -237,19 +259,21 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
     def __init__(
         self,
         source,
-        allowed_elements=ALLOWED_TAGS,
+        allowed_tags=ALLOWED_TAGS,
         attributes=ALLOWED_ATTRIBUTES,
         allowed_protocols=ALLOWED_PROTOCOLS,
-        strip_disallowed_elements=False,
+        attr_val_is_uri=html5lib_shim.attr_val_is_uri,
+        svg_attr_val_allows_ref=html5lib_shim.svg_attr_val_allows_ref,
+        svg_allow_local_href=html5lib_shim.svg_allow_local_href,
+        strip_disallowed_tags=False,
         strip_html_comments=True,
         css_sanitizer=None,
-        **kwargs,
     ):
         """Creates a BleachSanitizerFilter instance
 
         :arg source: html5lib TreeWalker stream as an html5lib TreeWalker
 
-        :arg list allowed_elements: allowed list of tags; defaults to
+        :arg set allowed_tags: set of allowed tags; defaults to
             ``bleach.sanitizer.ALLOWED_TAGS``
 
         :arg dict attributes: allowed attributes; can be a callable, list or dict;
@@ -258,8 +282,16 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
         :arg list allowed_protocols: allowed list of protocols for links; defaults
             to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
 
-        :arg bool strip_disallowed_elements: whether or not to strip disallowed
-            elements
+        :arg attr_val_is_uri: set of attributes that have URI values
+
+        :arg svg_attr_val_allows_ref: set of SVG attributes that can have
+            references
+
+        :arg svg_allow_local_href: set of SVG elements that can have local
+            hrefs
+
+        :arg bool strip_disallowed_tags: whether or not to strip disallowed
+            tags
 
         :arg bool strip_html_comments: whether or not to strip HTML comments
 
@@ -267,24 +299,24 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
             sanitizing style attribute values and style text; defaults to None
 
         """
-        self.attr_filter = attribute_filter_factory(attributes)
-        self.strip_disallowed_elements = strip_disallowed_elements
-        self.strip_html_comments = strip_html_comments
-        self.css_sanitizer = css_sanitizer
+        # NOTE(willkg): This is the superclass of
+        # html5lib.filters.sanitizer.Filter. We call this directly skipping the
+        # __init__ for html5lib.filters.sanitizer.Filter because that does
+        # things we don't need to do and kicks up the deprecation warning for
+        # using Sanitizer.
+        html5lib_shim.Filter.__init__(self, source)
 
-        # filter out html5lib deprecation warnings to use bleach from BleachSanitizerFilter init
-        warnings.filterwarnings(
-            "ignore",
-            message="html5lib's sanitizer is deprecated",
-            category=DeprecationWarning,
-            module="bleach._vendor.html5lib",
-        )
-        return super().__init__(
-            source,
-            allowed_elements=allowed_elements,
-            allowed_protocols=allowed_protocols,
-            **kwargs,
-        )
+        self.allowed_tags = frozenset(allowed_tags)
+        self.allowed_protocols = frozenset(allowed_protocols)
+
+        self.attr_filter = attribute_filter_factory(attributes)
+        self.strip_disallowed_tags = strip_disallowed_tags
+        self.strip_html_comments = strip_html_comments
+
+        self.attr_val_is_uri = attr_val_is_uri
+        self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
+        self.css_sanitizer = css_sanitizer
+        self.svg_allow_local_href = svg_allow_local_href
 
     def sanitize_stream(self, token_iterator):
         for token in token_iterator:
@@ -354,10 +386,10 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
         """
         token_type = token["type"]
         if token_type in ["StartTag", "EndTag", "EmptyTag"]:
-            if token["name"] in self.allowed_elements:
+            if token["name"] in self.allowed_tags:
                 return self.allow_token(token)
 
-            elif self.strip_disallowed_elements:
+            elif self.strip_disallowed_tags:
                 return None
 
             else:
@@ -570,7 +602,7 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
     def disallowed_token(self, token):
         token_type = token["type"]
         if token_type == "EndTag":
-            token["data"] = "</%s>" % token["name"]
+            token["data"] = f"</{token['name']}>"
 
         elif token["data"]:
             assert token_type in ("StartTag", "EmptyTag")
@@ -586,25 +618,19 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
                 if ns is None or ns not in html5lib_shim.prefixes:
                     namespaced_name = name
                 else:
-                    namespaced_name = "{}:{}".format(html5lib_shim.prefixes[ns], name)
+                    namespaced_name = f"{html5lib_shim.prefixes[ns]}:{name}"
 
-                attrs.append(
-                    ' %s="%s"'
-                    % (
-                        namespaced_name,
-                        # NOTE(willkg): HTMLSerializer escapes attribute values
-                        # already, so if we do it here (like HTMLSerializer does),
-                        # then we end up double-escaping.
-                        v,
-                    )
-                )
-            token["data"] = "<{}{}>".format(token["name"], "".join(attrs))
+                # NOTE(willkg): HTMLSerializer escapes attribute values
+                # already, so if we do it here (like HTMLSerializer does),
+                # then we end up double-escaping.
+                attrs.append(f' {namespaced_name}="{v}"')
+            token["data"] = f"<{token['name']}{''.join(attrs)}>"
 
         else:
-            token["data"] = "<%s>" % token["name"]
+            token["data"] = f"<{token['name']}>"
 
         if token.get("selfClosing"):
-            token["data"] = token["data"][:-1] + "/>"
+            token["data"] = f"{token['data'][:-1]}/>"
 
         token["type"] = "Characters"
 
diff --git a/requirements.txt b/requirements.txt
index 73c9308b..799b09b2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,7 @@ backports.csv==1.0.7
 backports.functools-lru-cache==1.6.4
 backports.zoneinfo==0.2.1;python_version<"3.9"
 beautifulsoup4==4.11.1
-bleach==5.0.1
+bleach==6.0.0
 certifi==2022.12.7
 cheroot==9.0.0
 cherrypy==18.8.0