plexpy/lib/soupsieve/css_parser.py
dependabot[bot] a0170a6f3d
Bump beautifulsoup4 from 4.12.2 to 4.12.3 (#2267)
* Bump beautifulsoup4 from 4.12.2 to 4.12.3

Bumps [beautifulsoup4](https://www.crummy.com/software/BeautifulSoup/bs4/) from 4.12.2 to 4.12.3.

---
updated-dependencies:
- dependency-name: beautifulsoup4
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>

* Update beautifulsoup4==4.12.3

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com>

[skip ci]
2024-03-24 15:26:22 -07:00

1281 lines
45 KiB
Python

"""CSS selector parser."""
from __future__ import annotations
import re
from functools import lru_cache
from . import util
from . import css_match as cm
from . import css_types as ct
from .util import SelectorSyntaxError
import warnings
from typing import Match, Any, Iterator, cast
# Code point (U+FFFD) substituted when an escape resolves to NULL
UNICODE_REPLACEMENT_CHAR = 0xFFFD

# Pseudo-classes written without an argument list
PSEUDO_SIMPLE = {
    ':any-link',
    ':checked',
    ':default',
    ':defined',
    ':disabled',
    ':empty',
    ':enabled',
    ':first-child',
    ':first-of-type',
    ':in-range',
    ':indeterminate',
    ':last-child',
    ':last-of-type',
    ':link',
    ':only-child',
    ':only-of-type',
    ':optional',
    ':out-of-range',
    ':placeholder-shown',
    ':read-only',
    ':read-write',
    ':required',
    ':root',
    ':scope',
}

# Argument-less pseudo-classes that are accepted but never match anything here
PSEUDO_SIMPLE_NO_MATCH = {
    ':active',
    ':current',
    ':focus',
    ':focus-visible',
    ':focus-within',
    ':future',
    ':host',
    ':hover',
    ':local-link',
    ':past',
    ':paused',
    ':playing',
    ':target',
    ':target-within',
    ':user-invalid',
    ':visited',
}

# Pseudo-classes whose argument is a selector list
PSEUDO_COMPLEX = {
    ':-soup-contains',
    ':-soup-contains-own',
    ':contains',
    ':has',
    ':is',
    ':matches',
    ':not',
    ':where',
}

# Pseudo-classes with an argument list that are accepted but never match
PSEUDO_COMPLEX_NO_MATCH = {
    ':current',
    ':host',
    ':host-context',
}

# Pseudo-classes with their own argument grammar, each handled by a dedicated pattern
PSEUDO_SPECIAL = {
    ':dir',
    ':lang',
    ':nth-child',
    ':nth-last-child',
    ':nth-last-of-type',
    ':nth-of-type',
}

# Every pseudo-class name the parser knows about, in any form
PSEUDO_SUPPORTED = (
    PSEUDO_SIMPLE
    | PSEUDO_SIMPLE_NO_MATCH
    | PSEUDO_COMPLEX
    | PSEUDO_COMPLEX_NO_MATCH
    | PSEUDO_SPECIAL
)
# Sub-patterns parts
# These are plain strings that get embedded into the larger patterns below; several of them
# span multiple lines, so any pattern that embeds them has to be compiled with `re.X`.
# Whitespace
NEWLINE = r'(?:\r\n|(?!\r\n)[\n\f\r])'
WS = fr'(?:[ \t]|{NEWLINE})'
# Comments
COMMENTS = r'(?:/\*[^*]*\*+(?:[^/*][^*]*\*+)*/)'
# Whitespace with comments included
WSC = fr'(?:{WS}|{COMMENTS})'
# CSS escapes: a backslash followed by 1-6 hex digits (plus one optional whitespace),
# by any character other than a newline, or by the end of the input.
CSS_ESCAPES = fr'(?:\\(?:[a-f0-9]{{1,6}}{WS}?|[^\r\n\f]|$))'
# Same as above, but an escaped newline is accepted as well (used inside quoted strings).
CSS_STRING_ESCAPES = fr'(?:\\(?:[a-f0-9]{{1,6}}{WS}?|[^\r\n\f]|$|{NEWLINE}))'
# CSS Identifier
IDENTIFIER = fr'''
(?:(?:-?(?:[^\x00-\x2f\x30-\x40\x5B-\x5E\x60\x7B-\x9f]|{CSS_ESCAPES})+|--)
(?:[^\x00-\x2c\x2e\x2f\x3A-\x40\x5B-\x5E\x60\x7B-\x9f]|{CSS_ESCAPES})*)
'''
# `nth` content
NTH = fr'(?:[-+])?(?:[0-9]+n?|n)(?:(?<=n){WSC}*(?:[-+]){WSC}*(?:[0-9]+))?'
# Value: quoted string or identifier
VALUE = fr'''(?:"(?:\\(?:.|{NEWLINE})|[^\\"\r\n\f]+)*?"|'(?:\\(?:.|{NEWLINE})|[^\\'\r\n\f]+)*?'|{IDENTIFIER}+)'''
# Attribute value comparison. `!=` is handled special as it is non-standard.
# The whole comparison part is optional; the pattern always ends at the closing `]`.
ATTR = fr'(?:{WSC}*(?P<cmp>[!~^|*$]?=){WSC}*(?P<value>{VALUE})(?:{WSC}*(?P<case>[is]))?)?{WSC}*\]'
# Selector patterns
# IDs (`#id`)
PAT_ID = fr'\#{IDENTIFIER}'
# Classes (`.class`)
PAT_CLASS = fr'\.{IDENTIFIER}'
# Prefix:Tag (`prefix|tag`)
PAT_TAG = fr'(?P<tag_ns>(?:{IDENTIFIER}|\*)?\|)?(?P<tag_name>{IDENTIFIER}|\*)'
# Attributes (`[attr]`, `[attr=value]`, etc.)
PAT_ATTR = fr'\[{WSC}*(?P<attr_ns>(?:{IDENTIFIER}|\*)?\|)?(?P<attr_name>{IDENTIFIER}){ATTR}'
# Pseudo class (`:pseudo-class`, `:pseudo-class(`)
PAT_PSEUDO_CLASS = fr'(?P<name>:{IDENTIFIER})(?P<open>\({WSC}*)?'
# Pseudo class special patterns. Matches `:pseudo-class(` for special case pseudo classes.
PAT_PSEUDO_CLASS_SPECIAL = fr'(?P<name>:{IDENTIFIER})(?P<open>\({WSC}*)'
# Custom pseudo class (`:--custom-pseudo`)
PAT_PSEUDO_CLASS_CUSTOM = fr'(?P<name>:(?=--){IDENTIFIER})'
# Closing pseudo group (`)`)
PAT_PSEUDO_CLOSE = fr'{WSC}*\)'
# Pseudo element (`::pseudo-element`)
PAT_PSEUDO_ELEMENT = fr':{PAT_PSEUDO_CLASS}'
# At rule (`@page`, etc.) (not supported)
# NOTE(review): the literal `P` after `@` means this pattern only recognizes at-rules whose
# name starts with `p`/`P` (it is compiled case-insensitively); confirm that is intentional.
PAT_AT_RULE = fr'@P{IDENTIFIER}'
# Pseudo class `nth-child` (`:nth-child(an+b [of S]?)`, `:first-child`, etc.)
# Ends either at the closing `)` or right after the `of` keyword (group `of`), in which
# case the selector list that follows is left for the caller to parse.
PAT_PSEUDO_NTH_CHILD = fr'''
(?P<pseudo_nth_child>{PAT_PSEUDO_CLASS_SPECIAL}
(?P<nth_child>{NTH}|even|odd))(?:{WSC}*\)|(?P<of>{COMMENTS}*{WS}{WSC}*of{COMMENTS}*{WS}{WSC}*))
'''
# Pseudo class `nth-of-type` (`:nth-of-type(an+b)`, `:first-of-type`, etc.)
PAT_PSEUDO_NTH_TYPE = fr'''
(?P<pseudo_nth_type>{PAT_PSEUDO_CLASS_SPECIAL}
(?P<nth_type>{NTH}|even|odd)){WSC}*\)
'''
# Pseudo class language (`:lang("*-de", en)`)
PAT_PSEUDO_LANG = fr'{PAT_PSEUDO_CLASS_SPECIAL}(?P<values>{VALUE}(?:{WSC}*,{WSC}*{VALUE})*){WSC}*\)'
# Pseudo class direction (`:dir(ltr)`)
PAT_PSEUDO_DIR = fr'{PAT_PSEUDO_CLASS_SPECIAL}(?P<dir>ltr|rtl){WSC}*\)'
# Combining characters (`>`, `~`, ` `, `+`, `,`)
PAT_COMBINE = fr'{WSC}*?(?P<relation>[,+>~]|{WS}(?![,+>~])){WSC}*'
# Extra: Contains (`:contains(text)`)
PAT_PSEUDO_CONTAINS = fr'{PAT_PSEUDO_CLASS_SPECIAL}(?P<values>{VALUE}(?:{WSC}*,{WSC}*{VALUE})*){WSC}*\)'
# Regular expressions
# CSS escape pattern
# Group 1: hex escape, group 2: escaped character, group 3: backslash at end of input.
# NOTE(review): the optional terminator of the hex escape here is `WSC` (whitespace or a
# comment), while `CSS_ESCAPES` and `RE_CSS_STR_ESC` use `WS` - confirm this is intended.
RE_CSS_ESC = re.compile(fr'(?:(\\[a-f0-9]{{1,6}}{WSC}?)|(\\[^\r\n\f])|(\\$))', re.I)
# String variant: same groups, plus group 4 for an escaped newline.
RE_CSS_STR_ESC = re.compile(fr'(?:(\\[a-f0-9]{{1,6}}{WS}?)|(\\[^\r\n\f])|(\\$)|(\\{NEWLINE}))', re.I)
# Pattern to break up `nth` specifiers
# Groups: `s1` sign of `a`, `a` the `an`/`n`/integer part, `s2` sign of `b`, `b` the offset.
RE_NTH = re.compile(fr'(?P<s1>[-+])?(?P<a>[0-9]+n?|n)(?:(?<=n){WSC}*(?P<s2>[-+]){WSC}*(?P<b>[0-9]+))?', re.I)
# Pattern to iterate multiple values.
# Each match is either a `value` or a `split` (the comma between two values).
RE_VALUES = re.compile(fr'(?:(?P<value>{VALUE})|(?P<split>{WSC}*,{WSC}*))', re.X)
# Whitespace checks
RE_WS = re.compile(WS)
RE_WS_BEGIN = re.compile(fr'^{WSC}*')
RE_WS_END = re.compile(fr'{WSC}*$')
# Validates that a whole string is a custom pseudo-class name (`:--name`)
RE_CUSTOM = re.compile(fr'^{PAT_PSEUDO_CLASS_CUSTOM}$', re.X)
# Constants
# Token that separates the entries of a selector list
COMMA_COMBINATOR = ','
# Token used for the descendant relation
WS_COMBINATOR = " "

# Parse flags: one bit each so that they can be OR-ed together
FLG_PSEUDO = 1 << 0
FLG_NOT = 1 << 1
FLG_RELATIVE = 1 << 2
FLG_DEFAULT = 1 << 3
FLG_HTML = 1 << 4
FLG_INDETERMINATE = 1 << 5
FLG_OPEN = 1 << 6
FLG_IN_RANGE = 1 << 7
FLG_OUT_OF_RANGE = 1 << 8
FLG_PLACEHOLDER_SHOWN = 1 << 9
FLG_FORGIVE = 1 << 10

# Upper bound on the number of compiled patterns kept by the compile cache
_MAXCACHE = 500
@lru_cache(maxsize=_MAXCACHE)
def _cached_css_compile(
    pattern: str,
    namespaces: ct.Namespaces | None,
    custom: ct.CustomSelectors | None,
    flags: int
) -> cm.SoupSieve:
    """
    Compile a selector string, memoizing the result.

    The custom selectors are normalized with `process_custom`, the pattern is parsed by
    `CSSParser`, and the resulting selector list is wrapped in a `cm.SoupSieve` together
    with the original arguments. Results are cached by `lru_cache`, keyed on the arguments.
    """

    parser = CSSParser(pattern, custom=process_custom(custom), flags=flags)
    selector_list = parser.process_selectors()
    return cm.SoupSieve(pattern, selector_list, namespaces, custom, flags)
def _purge_cache() -> None:
    """Purge the cache by clearing every entry memoized by `_cached_css_compile`."""

    _cached_css_compile.cache_clear()
def process_custom(custom: ct.CustomSelectors | None) -> dict[str, str | ct.SelectorList]:
    """
    Validate and normalize a mapping of custom pseudo-class selectors.

    Each key is lowercased with `util.lower`, checked against `RE_CUSTOM`, and stored under
    its CSS-unescaped form; the values are passed through untouched.

    Returns a new dictionary (empty when `custom` is `None`).

    Raises `SelectorSyntaxError` when a key is not a valid custom pseudo-class name and
    `KeyError` when two keys normalize to the same name.
    """

    custom_selectors: dict[str, str | ct.SelectorList] = {}
    if custom is None:
        return custom_selectors

    for key, value in custom.items():
        name = util.lower(key)
        if RE_CUSTOM.match(name) is None:
            raise SelectorSyntaxError(f"The name '{name}' is not a valid custom pseudo-class name")
        # Entries are stored under the unescaped name, so duplicates have to be detected
        # on the unescaped name as well; otherwise two spellings of the same name (one of
        # them using escapes) would silently overwrite each other.
        unescaped = css_unescape(name)
        if unescaped in custom_selectors:
            raise KeyError(f"The custom selector '{name}' has already been registered")
        custom_selectors[unescaped] = value
    return custom_selectors
def css_unescape(content: str, string: bool = False) -> str:
    """
    Unescape CSS value.

    Strings allow for spanning the value on multiple lines by escaping a new line.

    `content` is the text to unescape. When `string` is true the string pattern
    (`RE_CSS_STR_ESC`) is used, which additionally removes escaped newlines; otherwise
    `RE_CSS_ESC` is used. Returns the text with every escape replaced.
    """

    # Largest code point `chr()` accepts; anything above it cannot be represented.
    max_codepoint = 0x10FFFF

    def replace(m: Match[str]) -> str:
        """Replace with the appropriate substitute."""

        if m.group(1):
            # Hex escape. `int()` tolerates the optional trailing whitespace, but the
            # non-string pattern may also have captured a trailing comment, which always
            # starts with `/` - cut it off so the conversion cannot fail.
            hex_digits = m.group(1)[1:].split('/', 1)[0]
            codepoint = int(hex_digits, 16)
            # NULL and values beyond the Unicode range become the replacement character
            # (previously an out-of-range value made `chr()` raise `ValueError`).
            if codepoint == 0 or codepoint > max_codepoint:
                codepoint = UNICODE_REPLACEMENT_CHAR
            value = chr(codepoint)
        elif m.group(2):
            # Escaped character: drop the backslash
            value = m.group(2)[1:]
        elif m.group(3):
            # Lone backslash at the end of the input
            value = '\ufffd'
        else:
            # Escaped newline (string pattern only): a line continuation, removed entirely
            value = ''

        return value

    return (RE_CSS_ESC if not string else RE_CSS_STR_ESC).sub(replace, content)
def escape(ident: str) -> str:
    """
    Escape a string so that it can be used as a CSS identifier.

    NULL becomes U+FFFD, control characters and a digit in leading position (optionally
    after a leading `-`) become hex escapes followed by a space, ASCII letters, digits,
    `-`, `_` and anything from U+0080 up are kept, and every other character is prefixed
    with a backslash. A lone `-` is returned as an escaped dash.
    """

    if ident == '-':
        # An identifier consisting of a single dash has to be escaped as a whole
        return '\\-'

    # Last index at which a digit still counts as "leading"
    last_leading = 1 if ident.startswith('-') else 0

    pieces = []
    for pos, char in enumerate(ident):
        cp = ord(char)
        is_digit = '0' <= char <= '9'
        if cp == 0x00:
            pieces.append('\ufffd')
        elif cp <= 0x1F or cp == 0x7F:
            pieces.append(f'\\{cp:x} ')
        elif is_digit and pos <= last_leading:
            pieces.append(f'\\{cp:x} ')
        elif is_digit or char in '-_' or cp >= 0x80 or 'A' <= char <= 'Z' or 'a' <= char <= 'z':
            pieces.append(char)
        else:
            pieces.append(f'\\{char}')
    return ''.join(pieces)
class SelectorPattern:
    """A named token pattern, compiled case-insensitive, verbose and Unicode-aware."""

    def __init__(self, name: str, pattern: str) -> None:
        """Store the token name and compile `pattern`."""

        self.name = name
        self.re_pattern = re.compile(pattern, re.IGNORECASE | re.VERBOSE | re.UNICODE)

    def get_name(self) -> str:
        """Return the token name."""

        return self.name

    def match(self, selector: str, index: int, flags: int) -> Match[str] | None:
        """Try to match the pattern in `selector` starting at `index`; `flags` is not used here."""

        return self.re_pattern.match(selector, pos=index)
class SpecialPseudoPattern(SelectorPattern):
    """
    Dispatcher for pseudo-classes that have their own argument pattern.

    The pseudo-class name is read first, and only the pattern registered for that name
    is then applied.
    """

    def __init__(self, patterns: tuple[tuple[str, tuple[str, ...], str, type[SelectorPattern]], ...]) -> None:
        """
        Build the name to pattern table.

        Each entry of `patterns` is `(token name, pseudo-class names, regex, pattern class)`;
        one pattern object is created per entry and shared by all of its names.
        """

        self.patterns = {}
        for token_name, pseudo_names, regex, pattern_cls in patterns:
            compiled = pattern_cls(token_name, regex)
            for pseudo_name in pseudo_names:
                self.patterns[pseudo_name] = compiled

        # Pattern behind the most recent successful match (see `get_name`)
        self.matched_name = None  # type: SelectorPattern | None
        self.re_pseudo_name = re.compile(PAT_PSEUDO_CLASS_SPECIAL, re.I | re.X | re.U)

    def get_name(self) -> str:
        """Return the token name of the last pattern that matched, or `''` if none has."""

        if self.matched_name is None:
            return ''
        return self.matched_name.get_name()

    def match(self, selector: str, index: int, flags: int) -> Match[str] | None:
        """Match `selector` at `index` with the pattern registered for its pseudo-class name."""

        name_match = self.re_pseudo_name.match(selector, index)
        if not name_match:
            return None

        key = util.lower(css_unescape(name_match.group('name')))
        handler = self.patterns.get(key)
        if not handler:
            return None

        result = handler.match(selector, index, flags)
        if result:
            self.matched_name = handler
        return result
class _Selector:
    """
    Mutable scratch object for one compound selector.

    The parser fills it in piece by piece; once the compound selector is complete,
    `freeze` converts it into the `ct` selector objects.
    """

    def __init__(self, **kwargs: Any) -> None:
        """Initialize every field from `kwargs`, falling back to an empty default."""

        fetch = kwargs.get
        self.tag = fetch('tag', None)  # type: ct.SelectorTag | None
        self.ids = fetch('ids', [])  # type: list[str]
        self.classes = fetch('classes', [])  # type: list[str]
        self.attributes = fetch('attributes', [])  # type: list[ct.SelectorAttribute]
        self.nth = fetch('nth', [])  # type: list[ct.SelectorNth]
        self.selectors = fetch('selectors', [])  # type: list[ct.SelectorList]
        self.relations = fetch('relations', [])  # type: list[_Selector]
        self.rel_type = fetch('rel_type', None)  # type: str | None
        self.contains = fetch('contains', [])  # type: list[ct.SelectorContains]
        self.lang = fetch('lang', [])  # type: list[ct.SelectorLang]
        self.flags = fetch('flags', 0)  # type: int
        self.no_match = fetch('no_match', False)  # type: bool

    def _freeze_relations(self, relations: list[_Selector]) -> ct.SelectorList:
        """
        Freeze a list of related selectors.

        The remaining relations are appended to the first one's own relations, and that
        first selector is frozen into a one-element list. An empty input gives an empty list.
        """

        if not relations:
            return ct.SelectorList()

        head = relations[0]
        head.relations.extend(relations[1:])
        return ct.SelectorList([head.freeze()])

    def freeze(self) -> ct.Selector | ct.SelectorNull:
        """Return the frozen form: `ct.SelectorNull` when flagged as no-match, else `ct.Selector`."""

        if self.no_match:
            return ct.SelectorNull()

        return ct.Selector(
            self.tag,
            tuple(self.ids),
            tuple(self.classes),
            tuple(self.attributes),
            tuple(self.nth),
            tuple(self.selectors),
            self._freeze_relations(self.relations),
            self.rel_type,
            tuple(self.contains),
            tuple(self.lang),
            self.flags
        )

    def __str__(self) -> str:  # pragma: no cover
        """Return a debug representation listing every field."""

        field_names = (
            'tag', 'ids', 'classes', 'attributes', 'nth', 'selectors', 'relations',
            'rel_type', 'contains', 'lang', 'flags', 'no_match'
        )
        body = ', '.join(f'{field}={getattr(self, field)!r}' for field in field_names)
        return f'_Selector({body})'

    __repr__ = __str__
class CSSParser:
    """
    Parse CSS selectors.

    The selector string is split into tokens with the patterns in `css_tokens`, and the
    token stream is turned into a `ct.SelectorList` by `parse_selectors`.
    """

    # Token patterns. `selector_iter` tries them in this order at every position and
    # uses the first one that matches, so the order of the entries is significant.
    css_tokens = (
        SelectorPattern("pseudo_close", PAT_PSEUDO_CLOSE),
        # Pseudo-classes with their own argument grammar; the entry to apply is picked
        # by the pseudo-class name.
        SpecialPseudoPattern(
            (
                (
                    "pseudo_contains",
                    (':contains', ':-soup-contains', ':-soup-contains-own'),
                    PAT_PSEUDO_CONTAINS,
                    SelectorPattern
                ),
                ("pseudo_nth_child", (':nth-child', ':nth-last-child'), PAT_PSEUDO_NTH_CHILD, SelectorPattern),
                ("pseudo_nth_type", (':nth-of-type', ':nth-last-of-type'), PAT_PSEUDO_NTH_TYPE, SelectorPattern),
                ("pseudo_lang", (':lang',), PAT_PSEUDO_LANG, SelectorPattern),
                ("pseudo_dir", (':dir',), PAT_PSEUDO_DIR, SelectorPattern)
            )
        ),
        SelectorPattern("pseudo_class_custom", PAT_PSEUDO_CLASS_CUSTOM),
        SelectorPattern("pseudo_class", PAT_PSEUDO_CLASS),
        SelectorPattern("pseudo_element", PAT_PSEUDO_ELEMENT),
        SelectorPattern("at_rule", PAT_AT_RULE),
        SelectorPattern("id", PAT_ID),
        SelectorPattern("class", PAT_CLASS),
        SelectorPattern("tag", PAT_TAG),
        SelectorPattern("attribute", PAT_ATTR),
        SelectorPattern("combine", PAT_COMBINE)
    )
def __init__(
    self,
    selector: str,
    custom: dict[str, str | ct.SelectorList] | None = None,
    flags: int = 0
) -> None:
    """
    Set up the parser for one selector string.

    NULL characters in `selector` are replaced with U+FFFD. `custom` is the table of
    custom selectors (an empty one is created when omitted) and `flags` the parser flags;
    debug output is enabled when `util.DEBUG` is set in them.
    """

    if custom is None:
        custom = {}

    self.pattern = selector.replace('\x00', '\ufffd')
    self.flags = flags
    self.debug = flags & util.DEBUG
    self.custom = custom
def parse_attribute_selector(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
    """
    Turn an attribute token into a `ct.SelectorAttribute` and add it to `sel`.

    The comparison operator decides which regular expression is built for the value
    (none at all when only the attribute name is given). The non-standard `!=` is stored
    as a negated nested selector list. Always returns `True`.
    """

    op = m.group('cmp')
    raw_case = m.group('case')
    raw_ns = m.group('attr_ns')

    case = util.lower(raw_case) if raw_case else None
    ns = css_unescape(raw_ns[:-1]) if raw_ns else ''
    attr = css_unescape(m.group('attr_name'))

    # Case sensitivity: an explicit `i`/`s` flag wins; without one, the `type`
    # attribute is compared case-insensitively.
    is_type = False
    flags = re.DOTALL
    if case:
        if case == 'i':
            flags |= re.I
    elif util.lower(attr) == 'type':
        flags |= re.I
        is_type = True

    pattern = None
    inverse = False
    if op:
        raw_value = m.group('value')
        if raw_value.startswith(('"', "'")):
            value = css_unescape(raw_value[1:-1], True)
        else:
            value = css_unescape(raw_value)

        kind = op[0]
        if kind == '^':
            # Value starts with
            pattern = re.compile(rf'^{re.escape(value)}.*', flags)
        elif kind == '$':
            # Value ends with
            pattern = re.compile(rf'.*?{re.escape(value)}$', flags)
        elif kind == '*':
            # Value contains
            pattern = re.compile(rf'.*?{re.escape(value)}.*', flags)
        elif kind == '~':
            # Word in a whitespace separated list. An empty value, or one containing
            # whitespace, must match nothing: `[^\s\S]` can never be matched.
            word = r'[^\s\S]' if not value or RE_WS.search(value) else re.escape(value)
            pattern = re.compile(rf'.*?(?:(?<=^)|(?<=[ \t\r\n\f])){word}(?=(?:[ \t\r\n\f]|$)).*', flags)
        elif kind == '|':
            # Value, optionally followed by a dash and anything else
            pattern = re.compile(rf'^{re.escape(value)}(?:-.*)?$', flags)
        else:
            # Exact match; `!=` is the same as `:not([attr=value])`
            pattern = re.compile(rf'^{re.escape(value)}$', flags)
            inverse = kind == '!'

    # For `type`, a second pattern compiled without any flags is kept as well
    pattern2 = re.compile(pattern.pattern) if is_type and pattern else None

    sel_attr = ct.SelectorAttribute(attr, ns, pattern, pattern2)
    if inverse:
        # Nest the attribute under a negated selector list
        negated = _Selector()
        negated.attributes.append(sel_attr)
        sel.selectors.append(ct.SelectorList([negated.freeze()], True, False))
    else:
        sel.attributes.append(sel_attr)

    return True
def parse_tag_pattern(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
    """
    Store the tag (and optional namespace prefix) of a tag token on `sel`.

    The trailing `|` of the prefix is dropped and both parts are unescaped; the prefix
    is `None` when the token has none. Always returns `True`.
    """

    ns_part = m.group('tag_ns')
    prefix = css_unescape(ns_part[:-1]) if ns_part else None
    sel.tag = ct.SelectorTag(css_unescape(m.group('tag_name')), prefix)
    return True
def parse_pseudo_class_custom(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
    """
    Resolve a custom pseudo-class (`:--name`) and add its selector list to `sel`.

    Custom selectors are compiled on first use. While one is being compiled its entry
    is removed from the table, so a selector that refers to itself is reported as
    undefined instead of recursing forever; the compiled list is stored back afterwards.

    Raises `SelectorSyntaxError` when the name is not in the table. Always returns `True`.
    """

    pseudo = util.lower(css_unescape(m.group('name')))
    selector = self.custom.get(pseudo)

    if selector is None:
        position = m.end(0)
        raise SelectorSyntaxError(
            f"Undefined custom selector '{pseudo}' found at position {position}",
            self.pattern,
            position
        )

    if not isinstance(selector, ct.SelectorList):
        # Still a raw string: compile it now, with its own entry taken out of the table
        self.custom.pop(pseudo)
        sub_parser = CSSParser(selector, custom=self.custom, flags=self.flags)
        selector = sub_parser.process_selectors(flags=FLG_PSEUDO)
        self.custom[pseudo] = selector

    sel.selectors.append(selector)
    return True
def parse_pseudo_class(
    self,
    sel: _Selector,
    m: Match[str],
    has_selector: bool,
    iselector: Iterator[tuple[str, Match[str]]],
    is_html: bool
) -> tuple[bool, bool]:
    """
    Handle a generic pseudo-class token.

    Returns the updated `(has_selector, is_html)` pair. Raises `SelectorSyntaxError`
    when a known pseudo-class is used in the wrong form (with or without an argument
    list) and `NotImplementedError` when the name is not known at all.
    """

    pseudo = util.lower(css_unescape(m.group('name')))
    takes_args = bool(m.group('open'))

    if takes_args and pseudo in PSEUDO_COMPLEX:
        return self.parse_pseudo_open(sel, pseudo, has_selector, iselector, m.end(0)), is_html

    if not takes_args and pseudo in PSEUDO_SIMPLE:
        # Positional pseudo-classes expressed as `(of_type, last)` pairs of `nth` rules
        nth_shapes = {
            ':first-child': ((False, False),),
            ':last-child': ((False, True),),
            ':first-of-type': ((True, False),),
            ':last-of-type': ((True, True),),
            ':only-child': ((False, False), (False, True)),
            ':only-of-type': ((True, False), (True, True))
        }

        if pseudo in nth_shapes:
            sel.nth.extend(
                [
                    ct.SelectorNth(1, False, 0, of_type, last, ct.SelectorList())
                    for of_type, last in nth_shapes[pseudo]
                ]
            )
        # Pseudo-classes expressed as a flag on the selector
        elif pseudo == ':root':
            sel.flags |= ct.SEL_ROOT
        elif pseudo == ':scope':
            sel.flags |= ct.SEL_SCOPE
        elif pseudo == ':empty':
            sel.flags |= ct.SEL_EMPTY
        elif pseudo == ':defined':
            sel.flags |= ct.SEL_DEFINED
            is_html = True
        # Pseudo-classes expressed as a precompiled selector list. The module-level
        # lists are looked up only in the branch that needs them, as some of them are
        # themselves built with this parser.
        elif pseudo in (':link', ':any-link'):
            sel.selectors.append(CSS_LINK)
        elif pseudo == ':checked':
            sel.selectors.append(CSS_CHECKED)
        elif pseudo == ':default':
            sel.selectors.append(CSS_DEFAULT)
        elif pseudo == ':indeterminate':
            sel.selectors.append(CSS_INDETERMINATE)
        elif pseudo == ':disabled':
            sel.selectors.append(CSS_DISABLED)
        elif pseudo == ':enabled':
            sel.selectors.append(CSS_ENABLED)
        elif pseudo == ':required':
            sel.selectors.append(CSS_REQUIRED)
        elif pseudo == ':optional':
            sel.selectors.append(CSS_OPTIONAL)
        elif pseudo == ':read-only':
            sel.selectors.append(CSS_READ_ONLY)
        elif pseudo == ':read-write':
            sel.selectors.append(CSS_READ_WRITE)
        elif pseudo == ':in-range':
            sel.selectors.append(CSS_IN_RANGE)
        elif pseudo == ':out-of-range':
            sel.selectors.append(CSS_OUT_OF_RANGE)
        elif pseudo == ':placeholder-shown':
            sel.selectors.append(CSS_PLACEHOLDER_SHOWN)
        return True, is_html

    if takes_args and pseudo in PSEUDO_COMPLEX_NO_MATCH:
        # The argument list is parsed (and so validated), but its result is discarded
        self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN)
        sel.no_match = True
        return True, is_html

    if not takes_args and pseudo in PSEUDO_SIMPLE_NO_MATCH:
        sel.no_match = True
        return True, is_html

    if pseudo in PSEUDO_SUPPORTED:
        # Known name, but used in the wrong form
        raise SelectorSyntaxError(
            f"Invalid syntax for pseudo class '{pseudo}'",
            self.pattern,
            m.start(0)
        )

    raise NotImplementedError(
        f"'{pseudo}' pseudo-class is not implemented at this time"
    )
def parse_pseudo_nth(
    self,
    sel: _Selector,
    m: Match[str],
    has_selector: bool,
    iselector: Iterator[tuple[str, Match[str]]]
) -> bool:
    """
    Handle `:nth-child`, `:nth-last-child`, `:nth-of-type` and `:nth-last-of-type`.

    The `an+b` expression (or `even`/`odd`) is reduced to the integers `a` and `b` plus
    a flag telling whether `n` was present, and a `ct.SelectorNth` is added to `sel`.
    For the child variants an `of S` clause, when present, is parsed from `iselector`.
    Always returns `True`.
    """

    groups = m.groupdict()
    is_child = bool(groups.get('pseudo_nth_child'))
    pseudo_sel = util.lower(css_unescape(groups['name']))
    content = util.lower(groups.get('nth_child' if is_child else 'nth_type'))

    if content in ('even', 'odd'):
        # `even` is `2n`, `odd` is `2n+1`
        s1, s2, var = 2, (0 if content == 'even' else 1), True
    else:
        parts = cast(Match[str], RE_NTH.match(content))
        a = parts.group('a')
        var = a.endswith('n')
        if a.startswith('n'):
            # A bare `n` has an implied coefficient of one
            magnitude = '1'
        elif var:
            magnitude = a[:-1]
        else:
            magnitude = a
        s1 = int(('-' if parts.group('s1') == '-' else '') + magnitude, 10)

        b = parts.group('b')
        s2 = int(('-' if parts.group('s2') == '-' else '') + b, 10) if b else 0

    if is_child:
        if m.group('of'):
            # Parse the rest of `of S`.
            nth_sel = self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN)
        else:
            # Use default `*|*` for `of S`.
            nth_sel = CSS_NTH_OF_S_DEFAULT
        if pseudo_sel in (':nth-child', ':nth-last-child'):
            sel.nth.append(
                ct.SelectorNth(s1, var, s2, False, pseudo_sel == ':nth-last-child', nth_sel)
            )
    elif pseudo_sel in (':nth-of-type', ':nth-last-of-type'):
        sel.nth.append(
            ct.SelectorNth(s1, var, s2, True, pseudo_sel == ':nth-last-of-type', ct.SelectorList())
        )

    return True
def parse_pseudo_open(
    self,
    sel: _Selector,
    name: str,
    has_selector: bool,
    iselector: Iterator[tuple[str, Match[str]]],
    index: int
) -> bool:
    """
    Parse the selector list of a pseudo-class that was just opened with `(`.

    The nested list is parsed with the pseudo/open flags plus the one specific to
    `name` (`:not`, `:has`, `:where`/`:is`), and appended to `sel`. Always returns `True`.
    """

    extra_flag = {
        ':not': FLG_NOT,
        ':has': FLG_RELATIVE,
        ':where': FLG_FORGIVE,
        ':is': FLG_FORGIVE
    }
    flags = FLG_PSEUDO | FLG_OPEN | extra_flag.get(name, 0)

    nested = self.parse_selectors(iselector, index, flags)
    sel.selectors.append(nested)
    return True
def parse_has_combinator(
    self,
    sel: _Selector,
    m: Match[str],
    has_selector: bool,
    selectors: list[_Selector],
    rel_type: str,
    index: int
) -> tuple[bool, _Selector, str]:
    """
    Handle a combinator token inside a relative selector list (`:has()`).

    Here the combinator leads the selector it applies to. Returns
    `(has_selector, sel, rel_type)` for the next compound selector: `has_selector` is
    reset, `sel` is a fresh `_Selector`, and `rel_type` is its leading combinator.

    Raises `SelectorSyntaxError` when two non-whitespace combinators follow each other.
    """

    combinator = m.group('relation').strip() or WS_COMBINATOR
    is_comma = combinator == COMMA_COMBINATOR

    # A comma always closes the current selector; any other combinator only does so
    # when there is a selector to close.
    attach = is_comma or has_selector

    if not attach and rel_type[1:] != WS_COMBINATOR:
        # Patterns gobble up trailing whitespace, so two whitespace combinators can never
        # follow each other, nor can whitespace follow another combinator. Reaching this
        # point with a non-whitespace leading combinator therefore means that several
        # non-whitespace combinators were chained.
        raise SelectorSyntaxError(
            f'The multiple combinators at position {index}',
            self.pattern,
            index
        )

    if attach:
        # End the current selector and associate its leading combinator with it
        sel.rel_type = rel_type
        selectors[-1].relations.append(sel)

    if is_comma:
        # Start a new entry of the list with the default (descendant) relation
        rel_type = f':{WS_COMBINATOR}'
        selectors.append(_Selector())
    else:
        # Leading combinator for the next selector
        rel_type = f':{combinator}'

    return False, _Selector(), rel_type
def parse_combinator(
    self,
    sel: _Selector,
    m: Match[str],
    has_selector: bool,
    selectors: list[_Selector],
    relations: list[_Selector],
    is_pseudo: bool,
    is_forgive: bool,
    index: int
) -> tuple[bool, _Selector]:
    """
    Handle a combinator token in a regular (non-relative) selector list.

    A comma finishes the current complex selector and adds it to `selectors`; any other
    combinator turns the current compound selector into the pending relation of the next
    one. Returns `(False, <fresh _Selector>)`.

    Raises `SelectorSyntaxError` when no selector precedes the combinator, unless it is
    a comma inside a forgiving selector list.
    """

    combinator = m.group('relation').strip() or WS_COMBINATOR
    is_comma = combinator == COMMA_COMBINATOR

    if not has_selector:
        if not (is_forgive and is_comma):
            raise SelectorSyntaxError(
                f"The combinator '{combinator}' at position {index}, must have a selector before it",
                self.pattern,
                index
            )
        # Empty slot in a forgiving list: keep it as a selector that matches nothing
        sel.no_match = True
        relations.clear()
        selectors.append(sel)
    elif is_comma:
        if not sel.tag and not is_pseudo:
            # Implied `*`
            sel.tag = ct.SelectorTag('*', None)
        sel.relations.extend(relations)
        selectors.append(sel)
        relations.clear()
    else:
        # The current selector becomes the only pending relation
        sel.relations.extend(relations)
        sel.rel_type = combinator
        relations[:] = [sel]

    return False, _Selector()
def parse_class_id(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
    """
    Add a class (`.name`) or id (`#name`) token to `sel`.

    The leading character is dropped and the rest is unescaped. Always returns `True`.
    """

    token = m.group(0)
    target = sel.classes if token.startswith('.') else sel.ids
    target.append(css_unescape(token[1:]))
    return True
def parse_pseudo_contains(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
    """
    Parse `:-soup-contains()`, `:-soup-contains-own()` and the deprecated `:contains()`.

    Every value of the comma separated list (quoted string or identifier) is unescaped
    and the resulting texts are added to `sel` as one `ct.SelectorContains`.
    A `FutureWarning` is issued for `:contains`. Always returns `True`.
    """

    pseudo = util.lower(css_unescape(m.group('name')))
    if pseudo == ":contains":
        warnings.warn(  # noqa: B028
            "The pseudo class ':contains' is deprecated, ':-soup-contains' should be used moving forward.",
            FutureWarning
        )
    contains_own = pseudo == ":-soup-contains-own"

    # Tokenize the raw text and unescape each value exactly once below. Unescaping the
    # whole list up front as well would process escapes twice: an escaped quote would
    # end its string early and an escaped backslash would start a new escape.
    values = m.group('values')
    patterns = []
    for token in RE_VALUES.finditer(values):
        if token.group('split'):
            # Comma between two values
            continue
        value = token.group('value')
        if value.startswith(("'", '"')):
            value = css_unescape(value[1:-1], True)
        else:
            value = css_unescape(value)
        patterns.append(value)
    sel.contains.append(ct.SelectorContains(patterns, contains_own))
    return True
def parse_pseudo_lang(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
    """
    Parse the language ranges of `:lang()`.

    Every value of the comma separated list is unescaped (quoted values without their
    quotes) and the results are added to `sel` as one `ct.SelectorLang`.
    Always returns `True`.
    """

    raw_values = (
        token.group('value')
        for token in RE_VALUES.finditer(m.group('values'))
        if not token.group('split')
    )
    patterns = [
        css_unescape(raw[1:-1], True) if raw.startswith(('"', "'")) else css_unescape(raw)
        for raw in raw_values
    ]

    sel.lang.append(ct.SelectorLang(patterns))
    return True
def parse_pseudo_dir(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
    """Set the direction flag on `sel` for `:dir(ltr)` / `:dir(rtl)`. Always returns `True`."""

    direction = util.lower(m.group('dir'))
    sel.flags |= ct.SEL_DIR_LTR if direction == 'ltr' else ct.SEL_DIR_RTL
    return True
def parse_selectors(
    self,
    iselector: Iterator[tuple[str, Match[str]]],
    index: int = 0,
    flags: int = 0
) -> ct.SelectorList:
    """
    Parse selectors.

    Consumes `(token name, match)` pairs from `iselector` until the stream ends or,
    when `FLG_OPEN` is set, until the closing `)` of the enclosing pseudo-class.
    Nested pseudo-classes are parsed by recursive calls that share the same iterator.

    `index` is the position reported in error messages before any token is consumed;
    it is advanced to the end of each processed token. `flags` is a combination of the
    `FLG_*` constants.

    Returns the frozen `ct.SelectorList`.

    Raises `SelectorSyntaxError` for malformed input and `NotImplementedError` for
    at-rules and pseudo-elements.
    """

    # Initialize important variables
    sel = _Selector()
    selectors = []
    has_selector = False
    closed = False
    relations = []  # type: list[_Selector]
    rel_type = ":" + WS_COMBINATOR

    # Setup various flags
    is_open = bool(flags & FLG_OPEN)
    is_pseudo = bool(flags & FLG_PSEUDO)
    is_relative = bool(flags & FLG_RELATIVE)
    is_not = bool(flags & FLG_NOT)
    is_html = bool(flags & FLG_HTML)
    is_default = bool(flags & FLG_DEFAULT)
    is_indeterminate = bool(flags & FLG_INDETERMINATE)
    is_in_range = bool(flags & FLG_IN_RANGE)
    is_out_of_range = bool(flags & FLG_OUT_OF_RANGE)
    is_placeholder_shown = bool(flags & FLG_PLACEHOLDER_SHOWN)
    is_forgive = bool(flags & FLG_FORGIVE)

    # Print out useful debug stuff
    if self.debug:  # pragma: no cover
        if is_pseudo:
            print('    is_pseudo: True')
        if is_open:
            print('    is_open: True')
        if is_relative:
            print('    is_relative: True')
        if is_not:
            print('    is_not: True')
        if is_html:
            print('    is_html: True')
        if is_default:
            print('    is_default: True')
        if is_indeterminate:
            print('    is_indeterminate: True')
        if is_in_range:
            print('    is_in_range: True')
        if is_out_of_range:
            print('    is_out_of_range: True')
        if is_placeholder_shown:
            print('    is_placeholder_shown: True')
        if is_forgive:
            print('    is_forgive: True')

    # The algorithm for relative selectors require an initial selector in the selector list
    if is_relative:
        selectors.append(_Selector())

    try:
        # Runs until the token stream is exhausted (`StopIteration`) or a closing `)` breaks out
        while True:
            key, m = next(iselector)

            # Handle parts
            if key == "at_rule":
                raise NotImplementedError(f"At-rules found at position {m.start(0)}")
            elif key == 'pseudo_class_custom':
                has_selector = self.parse_pseudo_class_custom(sel, m, has_selector)
            elif key == 'pseudo_class':
                has_selector, is_html = self.parse_pseudo_class(sel, m, has_selector, iselector, is_html)
            elif key == 'pseudo_element':
                raise NotImplementedError(f"Pseudo-element found at position {m.start(0)}")
            elif key == 'pseudo_contains':
                has_selector = self.parse_pseudo_contains(sel, m, has_selector)
            elif key in ('pseudo_nth_type', 'pseudo_nth_child'):
                has_selector = self.parse_pseudo_nth(sel, m, has_selector, iselector)
            elif key == 'pseudo_lang':
                has_selector = self.parse_pseudo_lang(sel, m, has_selector)
            elif key == 'pseudo_dir':
                has_selector = self.parse_pseudo_dir(sel, m, has_selector)
                # Currently only supports HTML
                is_html = True
            elif key == 'pseudo_close':
                if not has_selector:
                    if not is_forgive:
                        raise SelectorSyntaxError(
                            f"Expected a selector at position {m.start(0)}",
                            self.pattern,
                            m.start(0)
                        )
                    # Forgiving lists tolerate an empty slot before the `)`
                    sel.no_match = True
                if is_open:
                    closed = True
                    break
                else:
                    raise SelectorSyntaxError(
                        f"Unmatched pseudo-class close at position {m.start(0)}",
                        self.pattern,
                        m.start(0)
                    )
            elif key == 'combine':
                # Relative lists (`:has()`) lead with the combinator and are handled separately
                if is_relative:
                    has_selector, sel, rel_type = self.parse_has_combinator(
                        sel, m, has_selector, selectors, rel_type, index
                    )
                else:
                    has_selector, sel = self.parse_combinator(
                        sel, m, has_selector, selectors, relations, is_pseudo, is_forgive, index
                    )
            elif key == 'attribute':
                has_selector = self.parse_attribute_selector(sel, m, has_selector)
            elif key == 'tag':
                if has_selector:
                    raise SelectorSyntaxError(
                        f"Tag name found at position {m.start(0)} instead of at the start",
                        self.pattern,
                        m.start(0)
                    )
                has_selector = self.parse_tag_pattern(sel, m, has_selector)
            elif key in ('class', 'id'):
                has_selector = self.parse_class_id(sel, m, has_selector)

            # Remember where the last processed token ended, for error reporting
            index = m.end(0)
    except StopIteration:
        pass

    # Handle selectors that are not closed
    if is_open and not closed:
        raise SelectorSyntaxError(
            f"Unclosed pseudo-class at position {index}",
            self.pattern,
            index
        )

    # Cleanup completed selector piece
    if has_selector:
        if not sel.tag and not is_pseudo:
            # Implied `*`
            sel.tag = ct.SelectorTag('*', None)
        if is_relative:
            sel.rel_type = rel_type
            selectors[-1].relations.append(sel)
        else:
            sel.relations.extend(relations)
            del relations[:]
            selectors.append(sel)

    # Forgive empty slots in pseudo-classes that have lists (and are forgiving)
    elif is_forgive and (not selectors or not relations):
        # Handle normal pseudo-classes with empty slots like `:is()` etc.
        sel.no_match = True
        del relations[:]
        selectors.append(sel)
        has_selector = True

    if not has_selector:
        # We will always need to finish a selector when `:has()` is used as it leads with combining.
        # May apply to others as well.
        raise SelectorSyntaxError(
            f'Expected a selector at position {index}',
            self.pattern,
            index
        )

    # Some patterns require additional logic, such as default. We try to make these the
    # last pattern, and append the appropriate flag to that selector which communicates
    # to the matcher what additional logic is required.
    # Note that the flag is assigned to the last selector, replacing whatever flags it had.
    if is_default:
        selectors[-1].flags = ct.SEL_DEFAULT
    if is_indeterminate:
        selectors[-1].flags = ct.SEL_INDETERMINATE
    if is_in_range:
        selectors[-1].flags = ct.SEL_IN_RANGE
    if is_out_of_range:
        selectors[-1].flags = ct.SEL_OUT_OF_RANGE
    if is_placeholder_shown:
        selectors[-1].flags = ct.SEL_PLACEHOLDER_SHOWN

    # Return selector list
    return ct.SelectorList([s.freeze() for s in selectors], is_not, is_html)
def selector_iter(self, pattern: str) -> Iterator[tuple[str, Match[str]]]:
    """
    Split `pattern` into tokens.

    Yields `(token name, match)` pairs. Whitespace and comments at the start and the end
    of the pattern are skipped. At every position the entries of `css_tokens` are tried
    in order and the first match is used.

    Raises `SelectorSyntaxError` when nothing matches at the current position.
    """

    # Ignore whitespace and comments at start and end of pattern
    leading = RE_WS_BEGIN.search(pattern)
    trailing = RE_WS_END.search(pattern)
    index = leading.end(0) if leading else 0
    end = (trailing.start(0) - 1) if trailing else (len(pattern) - 1)

    if self.debug:  # pragma: no cover
        print(f'## PARSING: {pattern!r}')

    # Characters that start a known selector type, used for a more specific error message
    malformed = {'[': 'attribute', '.': 'class', '#': 'id', ':': 'pseudo-class'}

    while index <= end:
        for token in self.css_tokens:
            found = token.match(pattern, index, self.flags)
            if not found:
                continue
            name = token.get_name()
            if self.debug:  # pragma: no cover
                print(f"TOKEN: '{name}' --> {found.group(0)!r} at position {found.start(0)}")
            index = found.end(0)
            yield name, found
            break
        else:
            # No token pattern matched here: report the offending character
            c = pattern[index]
            kind = malformed.get(c)
            if kind is not None:
                msg = f"Malformed {kind} selector at position {index}"
            else:
                msg = f"Invalid character {c!r} position {index}"
            raise SelectorSyntaxError(msg, self.pattern, index)

    if self.debug:  # pragma: no cover
        print('## END PARSING')
def process_selectors(self, index: int = 0, flags: int = 0) -> ct.SelectorList:
    """Tokenize the stored pattern and hand the token stream to the selector parser."""

    # The token iterator is created here and passed straight to `parse_selectors`,
    # together with the caller's `index` and `flags`.
    token_stream = self.selector_iter(self.pattern)
    return self.parse_selectors(token_stream, index, flags)
# Precompile CSS selector lists for pseudo-classes (additional logic may be required beyond the pattern)
# A few patterns are order dependent as they use patterns previously compiled.
# CSS pattern for `:link` and `:any-link`:
# an `a` or `area` element (under the `html|` namespace prefix) that has an `href` attribute.
CSS_LINK = CSSParser(
'html|*:is(a, area)[href]'
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:checked`:
# checkbox or radio `input` elements that have a `checked` attribute,
# plus `option` elements that have a `selected` attribute.
CSS_CHECKED = CSSParser(
'''
html|*:is(input[type=checkbox], input[type=radio])[checked], html|option[selected]
'''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:default` (must compile CSS_CHECKED first)
# The pattern covers `:checked` elements and `button`/`input` elements of type
# "submit" that are descendants of a `form`. The submit selector is deliberately
# written last: as the in-pattern comment says, special logic is applied to the
# last selector.
# NOTE(review): presumably `FLG_DEFAULT` is what makes `parse_selectors` tag that last
# selector with `ct.SEL_DEFAULT` -- confirm against the flag handling in `parse_selectors`.
CSS_DEFAULT = CSSParser(
'''
:checked,
/*
This pattern must be at the end.
Special logic is applied to the last selector.
*/
html|form html|*:is(button, input)[type="submit"]
'''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_DEFAULT)
# CSS pattern for `:indeterminate`:
#   - checkbox inputs that have an `indeterminate` attribute,
#   - unchecked radio inputs whose `name` is missing or empty,
#   - `progress` elements without a `value` attribute,
#   - unchecked radio inputs with a non-empty `name`; this selector must stay last
#     because special logic is applied to the last selector (see the in-pattern comment).
# NOTE(review): presumably `FLG_INDETERMINATE` is what makes `parse_selectors` tag the last
# selector with `ct.SEL_INDETERMINATE` -- confirm.
CSS_INDETERMINATE = CSSParser(
'''
html|input[type="checkbox"][indeterminate],
html|input[type="radio"]:is(:not([name]), [name=""]):not([checked]),
html|progress:not([value]),
/*
This pattern must be at the end.
Special logic is applied to the last selector.
*/
html|input[type="radio"][name]:not([name='']):not([checked])
'''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_INDETERMINATE)
# CSS pattern for `:disabled`
# Covers, in order:
#   1. form-related elements that carry a `disabled` attribute themselves
#      (each element name is listed once; a repeated name inside `:is()` adds nothing),
#   2. `option` children of an `optgroup` that has `disabled`,
#   3. controls that are direct children of a `fieldset` that has `disabled`,
#   4. controls nested under any child of such a `fieldset`, except under a `legend`
#      that is the first `legend` among its siblings.
CSS_DISABLED = CSSParser(
'''
html|*:is(input:not([type=hidden]), button, select, textarea, fieldset, optgroup, option)[disabled],
html|optgroup[disabled] > html|option,
html|fieldset[disabled] > html|*:is(input:not([type=hidden]), button, select, textarea, fieldset),
html|fieldset[disabled] >
html|*:not(legend:nth-of-type(1)) html|*:is(input:not([type=hidden]), button, select, textarea, fieldset)
'''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:enabled`:
# the form-related elements that can carry `disabled`, restricted to those that do not
# match `:disabled`. Each element name is listed once; a repeated name inside `:is()`
# adds nothing.
# NOTE(review): the pattern references `:disabled`, so presumably CSS_DISABLED must be
# compiled first -- confirm.
CSS_ENABLED = CSSParser(
'''
html|*:is(input:not([type=hidden]), button, select, textarea, fieldset, optgroup, option):not(:disabled)
'''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:required`:
# `input`, `textarea`, or `select` elements that have a `required` attribute.
CSS_REQUIRED = CSSParser(
'html|*:is(input, textarea, select)[required]'
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:optional`:
# `input`, `textarea`, or `select` elements that do not have a `required` attribute.
CSS_OPTIONAL = CSSParser(
'html|*:is(input, textarea, select):not([required])'
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:placeholder-shown`:
#   - `input` elements with no `type`, an empty `type`, or one of the listed text-like
#     types, that have a non-empty `placeholder` and a missing or empty `value`,
#   - `textarea` elements that have a non-empty `placeholder`.
# NOTE(review): presumably `FLG_PLACEHOLDER_SHOWN` is what makes `parse_selectors` tag the
# last selector (the `textarea` one) with `ct.SEL_PLACEHOLDER_SHOWN` -- confirm.
CSS_PLACEHOLDER_SHOWN = CSSParser(
'''
html|input:is(
:not([type]),
[type=""],
[type=text],
[type=search],
[type=url],
[type=tel],
[type=email],
[type=password],
[type=number]
)[placeholder]:not([placeholder='']):is(:not([value]), [value=""]),
html|textarea[placeholder]:not([placeholder=''])
'''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_PLACEHOLDER_SHOWN)
# CSS pattern default for `:nth-child` "of S" feature:
# `*|*` is the universal selector with a wildcard namespace.
# Unlike the other precompiled patterns here, this one is compiled without `FLG_HTML`.
CSS_NTH_OF_S_DEFAULT = CSSParser(
'*|*'
).process_selectors(flags=FLG_PSEUDO)
# CSS pattern for `:read-write` (CSS_DISABLED must be compiled first)
#   - `textarea` elements, and `input` elements with no `type`, an empty `type`, or one
#     of the listed text/date/time types, as long as they neither have a `readonly`
#     attribute nor match `:disabled`,
#   - elements whose `contenteditable` attribute is empty or equals "true"
#     (the `i` flag makes that comparison case-insensitive).
CSS_READ_WRITE = CSSParser(
'''
html|*:is(
textarea,
input:is(
:not([type]),
[type=""],
[type=text],
[type=search],
[type=url],
[type=tel],
[type=email],
[type=number],
[type=password],
[type=date],
[type=datetime-local],
[type=month],
[type=time],
[type=week]
)
):not([readonly], :disabled),
html|*:is([contenteditable=""], [contenteditable="true" i])
'''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:read-only`:
# any element (under the `html|` namespace prefix) that does not match `:read-write`.
# NOTE(review): the pattern references `:read-write`, so presumably CSS_READ_WRITE must be
# compiled first -- confirm.
CSS_READ_ONLY = CSSParser(
'''
html|*:not(:read-write)
'''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:in-range`:
# `input` elements of one of the listed date/time/numeric types that have a `min` or
# a `max` attribute. The pattern only selects candidates; it does not compare values.
# NOTE(review): presumably `FLG_IN_RANGE` is what makes `parse_selectors` tag the selector
# with `ct.SEL_IN_RANGE` so the matcher performs the actual range check -- confirm.
CSS_IN_RANGE = CSSParser(
'''
html|input:is(
[type="date"],
[type="month"],
[type="week"],
[type="time"],
[type="datetime-local"],
[type="number"],
[type="range"]
):is(
[min],
[max]
)
'''
).process_selectors(flags=FLG_PSEUDO | FLG_IN_RANGE | FLG_HTML)
# CSS pattern for `:out-of-range`:
# `input` elements of one of the listed date/time/numeric types that have a `min` or
# a `max` attribute. The pattern only selects candidates; it does not compare values.
# NOTE(review): presumably `FLG_OUT_OF_RANGE` is what makes `parse_selectors` tag the
# selector with `ct.SEL_OUT_OF_RANGE` so the matcher performs the actual range check -- confirm.
CSS_OUT_OF_RANGE = CSSParser(
'''
html|input:is(
[type="date"],
[type="month"],
[type="week"],
[type="time"],
[type="datetime-local"],
[type="number"],
[type="range"]
):is(
[min],
[max]
)
'''
).process_selectors(flags=FLG_PSEUDO | FLG_OUT_OF_RANGE | FLG_HTML)