mirror of
https://github.com/clinton-hall/nzbToMedia.git
synced 2025-01-24 03:42:59 -08:00
56c6773c6b
Updates colorama to 0.4.6 Adds confuse version 1.7.0 Updates jellyfish to 0.9.0 Adds mediafile 0.10.1 Updates munkres to 1.1.4 Updates musicbrainzngs to 0.7.1 Updates mutagen to 1.46.0 Updates pyyaml to 6.0 Updates unidecode to 1.3.6
139 lines
4.1 KiB
Python
139 lines
4.1 KiB
Python
# -*- coding: utf-8 -*-
|
|
# vi:tabstop=4:expandtab:sw=4
|
|
"""Transliterate Unicode text into plain 7-bit ASCII.
|
|
|
|
Example usage:
|
|
|
|
>>> from unidecode import unidecode
|
|
>>> unidecode("\u5317\u4EB0")
|
|
'Bei Jing '
|
|
|
|
The transliteration uses a straightforward map, and doesn't have alternatives
|
|
for the same character based on language, position, or anything else.
|
|
|
|
A standard string object will be returned. If you need bytes, use:
|
|
|
|
>>> unidecode("Κνωσός").encode("ascii")
|
|
b'Knosos'
|
|
"""
|
|
import warnings
|
|
from typing import Dict, Optional, Sequence
|
|
|
|
# Lazily populated lookup cache used by _get_repl_str().
# Key: the "section" of a code point (codepoint >> 8).
# Value: that section's replacement table (the ``data`` attribute of the
# matching ``unidecode.xNNN`` module), or None once importing that module
# has failed, so the import is not attempted again.
Cache = {} # type: Dict[int, Optional[Sequence[Optional[str]]]]
|
|
|
|
class UnidecodeError(ValueError):
    """Raised for Unidecode-related errors.

    The ``index`` attribute holds the index of the character that caused
    the error, or None when no index was supplied.
    """

    def __init__(self, message: str, index: Optional[int] = None) -> None:
        """Store *message* on the exception and remember *index*."""
        super().__init__(message)
        self.index = index
|
|
|
|
|
|
def unidecode_expect_ascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
    """Transliterate a Unicode string into an ASCII string.

    >>> unidecode("\u5317\u4EB0")
    'Bei Jing '

    This function first tries to convert the string using the ASCII codec.
    If that fails (because of non-ASCII characters), it falls back to
    transliteration using the character tables.

    This is approx. five times faster if the string only contains ASCII
    characters, but slightly slower than unidecode_expect_nonascii if
    non-ASCII characters are present.

    errors specifies what to do with characters that have not been
    found in replacement tables. The default is 'ignore' which ignores
    the character. 'strict' raises an UnidecodeError. 'replace'
    substitutes the character with replace_str (default is '?').
    'preserve' keeps the original character.

    Note that if 'preserve' is used the returned string might not be
    ASCII!

    Note also that a pure-ASCII input is returned unchanged before the
    errors argument is ever looked at.
    """
    try:
        # The encode call is only a fast "is everything ASCII?" probe; the
        # resulting bytes are not needed, so they are not kept.
        string.encode('ASCII')
    except UnicodeEncodeError:
        # At least one non-ASCII character: fall through to the table lookup.
        pass
    else:
        return string

    return _unidecode(string, errors, replace_str)
|
|
|
|
def unidecode_expect_nonascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
    """Transliterate a Unicode string into an ASCII string.

    >>> unidecode("\u5317\u4EB0")
    'Bei Jing '

    Unlike unidecode_expect_ascii, no ASCII-only shortcut is attempted:
    the string always goes through the character-by-character
    transliteration. See unidecode_expect_ascii for the meaning of the
    errors and replace_str arguments.
    """
    return _unidecode(string, errors=errors, replace_str=replace_str)
|
|
|
|
# Default entry point (``from unidecode import unidecode``): an alias for
# unidecode_expect_ascii.
unidecode = unidecode_expect_ascii
|
|
|
|
def _get_repl_str(char: str) -> Optional[str]:
    """Return the replacement string for a single character, or None.

    ASCII characters are returned as they are. For everything else the
    replacement is read from the ``data`` sequence of the module
    ``unidecode.xNNN``, where NNN is the code point without its lowest
    eight bits; loaded tables (and failed imports) are remembered in the
    module-level Cache.

    None is returned when the code point is above 0xeffff, when no table
    module can be imported for it, when the table is empty or too short
    for the character's position, or when the table entry itself is None.
    A surrogate code point additionally triggers a RuntimeWarning before
    the lookup is attempted.
    """
    cp = ord(char)

    # Already ASCII
    if cp < 0x80:
        return str(char)

    # No data on characters in Private Use Area and above.
    if cp > 0xeffff:
        return None

    if 0xd800 <= cp <= 0xdfff:
        warnings.warn("Surrogate character %r will be ignored. "
                      "You might be using a narrow Python build." % (char,),
                      RuntimeWarning, 2)

    # High part selects the table module, low byte the slot inside it.
    section, position = divmod(cp, 256)

    if section in Cache:
        table = Cache[section]
    else:
        try:
            mod = __import__('unidecode.x%03x' % (section,), globals(), locals(), ['data'])
        except ImportError:
            # No data on this character; remember that for later calls.
            Cache[section] = None
            return None

        table = mod.data
        Cache[section] = table

    if not table or position >= len(table):
        return None
    return table[position]
|
|
|
|
def _unidecode(string: str, errors: str, replace_str: str) -> str:
    """Transliterate *string* one character at a time and join the results.

    Each character is looked up with _get_repl_str. When that yields None,
    *errors* decides what happens:

    'ignore'   -- the character is dropped
    'strict'   -- UnidecodeError is raised, carrying the character's index
    'replace'  -- *replace_str* is used in its place
    'preserve' -- the original character is kept

    Any other value raises UnidecodeError. The value of *errors* is only
    examined when a character without a replacement is encountered.
    """
    pieces = []

    for position, ch in enumerate(string):
        mapped = _get_repl_str(ch)

        if mapped is not None:
            pieces.append(mapped)
        elif errors == 'ignore':
            # Nothing is emitted for this character.
            pass
        elif errors == 'strict':
            raise UnidecodeError('no replacement found for character %r '
                                 'in position %d' % (ch, position), position)
        elif errors == 'replace':
            pieces.append(replace_str)
        elif errors == 'preserve':
            pieces.append(ch)
        else:
            raise UnidecodeError('invalid value for errors parameter %r' % (errors,))

    return ''.join(pieces)
|