import unicodedata
from collections import defaultdict
from itertools import zip_longest

from .porter import Stemmer


def _normalize(s):
    return unicodedata.normalize("NFKD", s)


def _check_type(s):
    if not isinstance(s, str):
        raise TypeError("expected str or unicode, got %s" % type(s).__name__)


def levenshtein_distance(s1, s2):
    _check_type(s1)
    _check_type(s2)

    if s1 == s2:
        return 0
    rows = len(s1) + 1
    cols = len(s2) + 1

    if not s1:
        return cols - 1
    if not s2:
        return rows - 1

    prev = None
    cur = range(cols)
    for r in range(1, rows):
        prev, cur = cur, [r] + [0] * (cols - 1)
        for c in range(1, cols):
            deletion = prev[c] + 1
            insertion = cur[c - 1] + 1
            edit = prev[c - 1] + (0 if s1[r - 1] == s2[c - 1] else 1)
            cur[c] = min(edit, deletion, insertion)

    return cur[-1]


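# Hand-checked example for levenshtein_distance (added for illustration; not
# part of the original module):
#     levenshtein_distance("kitten", "sitting")  # -> 3
#     (substitute k->s, substitute e->i, insert g)

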
def _jaro_winkler(s1, s2, long_tolerance, winklerize):
    _check_type(s1)
    _check_type(s2)

    s1_len = len(s1)
    s2_len = len(s2)

    if not s1_len or not s2_len:
        return 0.0

    min_len = min(s1_len, s2_len)
    search_range = max(s1_len, s2_len)
    search_range = (search_range // 2) - 1
    if search_range < 0:
        search_range = 0

    s1_flags = [False] * s1_len
    s2_flags = [False] * s2_len

    # looking only within search range, count & flag matched pairs
    common_chars = 0
    for i, s1_ch in enumerate(s1):
        low = max(0, i - search_range)
        hi = min(i + search_range, s2_len - 1)
        for j in range(low, hi + 1):
            if not s2_flags[j] and s2[j] == s1_ch:
                s1_flags[i] = s2_flags[j] = True
                common_chars += 1
                break

    # short circuit if no characters match
    if not common_chars:
        return 0.0

    # count transpositions
    k = trans_count = 0
    for i, s1_f in enumerate(s1_flags):
        if s1_f:
            for j in range(k, s2_len):
                if s2_flags[j]:
                    k = j + 1
                    break
            if s1[i] != s2[j]:
                trans_count += 1
    trans_count //= 2

    # adjust for similarities in nonmatched characters
    common_chars = float(common_chars)
    weight = (
        common_chars / s1_len
        + common_chars / s2_len
        + (common_chars - trans_count) / common_chars
    ) / 3

    # winkler modification: continue to boost if strings are similar
    if winklerize and weight > 0.7:
        # adjust for up to first 4 chars in common
        j = min(min_len, 4)
        i = 0
        while i < j and s1[i] == s2[i]:
            i += 1
        if i:
            weight += i * 0.1 * (1.0 - weight)

        # optionally adjust for long strings
        # after agreeing beginning chars, at least two or more must agree and
        # agreed characters must be > half of remaining characters
        if (
            long_tolerance
            and min_len > 4
            and common_chars > i + 1
            and 2 * common_chars >= min_len + i
        ):
            weight += (1.0 - weight) * (
                float(common_chars - i - 1) / float(s1_len + s2_len - i * 2 + 2)
            )

    return weight


def jaro_similarity(s1, s2):
    return _jaro_winkler(s1, s2, False, False)  # noqa


def jaro_winkler_similarity(s1, s2, long_tolerance=False):
    return _jaro_winkler(s1, s2, long_tolerance, True)  # noqa


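# Hand-checked examples for the Jaro functions above (added for illustration;
# values rounded to three decimals):
#     jaro_similarity("MARTHA", "MARHTA")          # -> ~0.944
#     jaro_winkler_similarity("MARTHA", "MARHTA")  # -> ~0.961 (common prefix "MAR" boosts the score)

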
def damerau_levenshtein_distance(s1, s2):
    _check_type(s1)
    _check_type(s2)

    len1 = len(s1)
    len2 = len(s2)
    infinite = len1 + len2

    # character array
    da = defaultdict(int)

    # distance matrix
    score = [[0] * (len2 + 2) for x in range(len1 + 2)]

    score[0][0] = infinite
    for i in range(0, len1 + 1):
        score[i + 1][0] = infinite
        score[i + 1][1] = i
    for i in range(0, len2 + 1):
        score[0][i + 1] = infinite
        score[1][i + 1] = i

    for i in range(1, len1 + 1):
        db = 0
        for j in range(1, len2 + 1):
            i1 = da[s2[j - 1]]
            j1 = db
            cost = 1
            if s1[i - 1] == s2[j - 1]:
                cost = 0
                db = j

            score[i + 1][j + 1] = min(
                score[i][j] + cost,
                score[i + 1][j] + 1,
                score[i][j + 1] + 1,
                score[i1][j1] + (i - i1 - 1) + 1 + (j - j1 - 1),
            )
        da[s1[i - 1]] = i

    return score[len1 + 1][len2 + 1]


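# Hand-checked example for damerau_levenshtein_distance (added for
# illustration): a single transposition costs 1 here, versus 2 under plain
# Levenshtein.
#     damerau_levenshtein_distance("ab", "ba")  # -> 1
#     levenshtein_distance("ab", "ba")          # -> 2

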
def soundex(s):

    _check_type(s)

    if not s:
        return ""

    s = _normalize(s)
    s = s.upper()

    replacements = (
        ("BFPV", "1"),
        ("CGJKQSXZ", "2"),
        ("DT", "3"),
        ("L", "4"),
        ("MN", "5"),
        ("R", "6"),
    )
    result = [s[0]]
    count = 1

    # find would-be replacement for first character
    for lset, sub in replacements:
        if s[0] in lset:
            last = sub
            break
    else:
        last = None

    for letter in s[1:]:
        for lset, sub in replacements:
            if letter in lset:
                if sub != last:
                    result.append(sub)
                    count += 1
                last = sub
                break
        else:
            if letter != "H" and letter != "W":
                # leave last alone if middle letter is H or W
                last = None
        if count == 4:
            break

    result += "0" * (4 - count)
    return "".join(result)


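# Hand-checked examples for soundex (added for illustration): names that sound
# alike share a code.
#     soundex("Robert")  # -> "R163"
#     soundex("Rupert")  # -> "R163"

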
def hamming_distance(s1, s2):
    _check_type(s1)
    _check_type(s2)

    # ensure length of s1 >= s2
    if len(s2) > len(s1):
        s1, s2 = s2, s1

    # distance is difference in length + differing chars
    distance = len(s1) - len(s2)
    for i, c in enumerate(s2):
        if c != s1[i]:
            distance += 1

    return distance


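# Hand-checked examples for hamming_distance (added for illustration): unequal
# lengths are allowed here; the length difference is added to the count.
#     hamming_distance("karolin", "kathrin")  # -> 3
#     hamming_distance("abc", "ab")           # -> 1

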
def nysiis(s):

    _check_type(s)

    if not s:
        return ""

    s = s.upper()
    key = []

    # step 1 - prefixes
    if s.startswith("MAC"):
        s = "MCC" + s[3:]
    elif s.startswith("KN"):
        s = s[1:]
    elif s.startswith("K"):
        s = "C" + s[1:]
    elif s.startswith(("PH", "PF")):
        s = "FF" + s[2:]
    elif s.startswith("SCH"):
        s = "SSS" + s[3:]

    # step 2 - suffixes
    if s.endswith(("IE", "EE")):
        s = s[:-2] + "Y"
    elif s.endswith(("DT", "RT", "RD", "NT", "ND")):
        s = s[:-2] + "D"

    # step 3 - first character of key comes from name
    key.append(s[0])

    # step 4 - translate remaining chars
    i = 1
    len_s = len(s)
    while i < len_s:
        ch = s[i]
        if ch == "E" and i + 1 < len_s and s[i + 1] == "V":
            ch = "AF"
            i += 1
        elif ch in "AEIOU":
            ch = "A"
        elif ch == "Q":
            ch = "G"
        elif ch == "Z":
            ch = "S"
        elif ch == "M":
            ch = "N"
        elif ch == "K":
            if i + 1 < len(s) and s[i + 1] == "N":
                ch = "N"
            else:
                ch = "C"
        elif ch == "S" and s[i + 1 : i + 3] == "CH":
            ch = "SS"
            i += 2
        elif ch == "P" and i + 1 < len(s) and s[i + 1] == "H":
            ch = "F"
            i += 1
        elif ch == "H" and (
            s[i - 1] not in "AEIOU"
            or (i + 1 < len(s) and s[i + 1] not in "AEIOU")
            or (i + 1 == len(s))
        ):
            if s[i - 1] in "AEIOU":
                ch = "A"
            else:
                ch = s[i - 1]
        elif ch == "W" and s[i - 1] in "AEIOU":
            ch = s[i - 1]

        if ch[-1] != key[-1][-1]:
            key.append(ch)

        i += 1

    key = "".join(key)

    # step 5 - remove trailing S
    if key.endswith("S") and key != "S":
        key = key[:-1]

    # step 6 - replace AY w/ Y
    if key.endswith("AY"):
        key = key[:-2] + "Y"

    # step 7 - remove trailing A
    if key.endswith("A") and key != "A":
        key = key[:-1]

    # step 8 was already done

    return key


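# Hand-traced examples for nysiis (added for illustration; traced against the
# code above):
#     nysiis("Knuth")      # -> "NAT"
#     nysiis("Macintosh")  # -> "MCANT"

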
def match_rating_codex(s):
    _check_type(s)

    # we ignore spaces
    s = s.upper().replace(" ", "")
    codex = []

    prev = None
    first = True
    for c in s:
        # starting character
        # or consonant not preceded by same consonant
        if first or (c not in "AEIOU" and c != prev):
            codex.append(c)

        prev = c
        first = False

    # just use first/last 3
    if len(codex) > 6:
        return "".join(codex[:3] + codex[-3:])
    else:
        return "".join(codex)


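# Hand-checked examples for match_rating_codex (added for illustration): the
# first letter is kept, later vowels and repeated consonants are dropped.
#     match_rating_codex("Byrne")  # -> "BYRN"
#     match_rating_codex("Boern")  # -> "BRN"

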
def match_rating_comparison(s1, s2):
    codex1 = match_rating_codex(s1)
    codex2 = match_rating_codex(s2)
    len1 = len(codex1)
    len2 = len(codex2)
    res1 = []
    res2 = []

    # length differs by 3 or more, no result
    if abs(len1 - len2) >= 3:
        return None

    # get minimum rating based on sums of codexes
    lensum = len1 + len2
    if lensum <= 4:
        min_rating = 5
    elif lensum <= 7:
        min_rating = 4
    elif lensum <= 11:
        min_rating = 3
    else:
        min_rating = 2

    # strip off common prefixes
    for c1, c2 in zip_longest(codex1, codex2):
        if c1 != c2:
            if c1:
                res1.append(c1)
            if c2:
                res2.append(c2)

    unmatched_count1 = unmatched_count2 = 0
    for c1, c2 in zip_longest(reversed(res1), reversed(res2)):
        if c1 != c2:
            if c1:
                unmatched_count1 += 1
            if c2:
                unmatched_count2 += 1

    return (6 - max(unmatched_count1, unmatched_count2)) >= min_rating


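# Hand-checked example for match_rating_comparison (added for illustration):
#     match_rating_comparison("Byrne", "Boern")  # -> True
#     (codices "BYRN" and "BRN" rate 5, which meets the minimum rating of 4)

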
def metaphone(s):
    _check_type(s)

    result = []

    s = _normalize(s.lower())

    # skip first character if s starts with these
    if s.startswith(("kn", "gn", "pn", "wr", "ae")):
        s = s[1:]

    i = 0

    while i < len(s):
        c = s[i]
        next = s[i + 1] if i < len(s) - 1 else "*****"
        nextnext = s[i + 2] if i < len(s) - 2 else "*****"

        # skip doubles except for cc
        if c == next and c != "c":
            i += 1
            continue

        if c in "aeiou":
            if i == 0 or s[i - 1] == " ":
                result.append(c)
        elif c == "b":
            if (not (i != 0 and s[i - 1] == "m")) or next:
                result.append("b")
        elif c == "c":
            if next == "i" and nextnext == "a" or next == "h":
                result.append("x")
                i += 1
            elif next in "iey":
                result.append("s")
                i += 1
            else:
                result.append("k")
        elif c == "d":
            if next == "g" and nextnext in "iey":
                result.append("j")
                i += 2
            else:
                result.append("t")
        elif c in "fjlmnr":
            result.append(c)
        elif c == "g":
            if next in "iey":
                result.append("j")
            elif next == "h" and nextnext and nextnext not in "aeiou":
                i += 1
            elif next == "n" and not nextnext:
                i += 1
            else:
                result.append("k")
        elif c == "h":
            if i == 0 or next in "aeiou" or s[i - 1] not in "aeiou":
                result.append("h")
        elif c == "k":
            if i == 0 or s[i - 1] != "c":
                result.append("k")
        elif c == "p":
            if next == "h":
                result.append("f")
                i += 1
            else:
                result.append("p")
        elif c == "q":
            result.append("k")
        elif c == "s":
            if next == "h":
                result.append("x")
                i += 1
            elif next == "i" and nextnext in "oa":
                result.append("x")
                i += 2
            else:
                result.append("s")
        elif c == "t":
            if next == "i" and nextnext in "oa":
                result.append("x")
            elif next == "h":
                result.append("0")
                i += 1
            elif next != "c" or nextnext != "h":
                result.append("t")
        elif c == "v":
            result.append("f")
        elif c == "w":
            if i == 0 and next == "h":
                i += 1
                result.append("w")
            elif next in "aeiou":
                result.append("w")
        elif c == "x":
            if i == 0:
                if next == "h" or (next == "i" and nextnext in "oa"):
                    result.append("x")
                else:
                    result.append("s")
            else:
                result.append("k")
                result.append("s")
        elif c == "y":
            if next in "aeiou":
                result.append("y")
        elif c == "z":
            result.append("s")
        elif c == " ":
            if len(result) > 0 and result[-1] != " ":
                result.append(" ")

        i += 1

    return "".join(result).upper()


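# Hand-traced examples for metaphone (added for illustration): "sh" maps to
# "X" and "ph" maps to "F", so these two inputs collide.
#     metaphone("fish")   # -> "FX"
#     metaphone("phish")  # -> "FX"

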
def porter_stem(s):
    _check_type(s)

    return Stemmer(s).stem()
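

# Hand-checked examples for porter_stem (added for illustration; assumes the
# bundled porter.Stemmer implements the standard Porter algorithm):
#     porter_stem("running")   # -> "run"
#     porter_stem("caresses")  # -> "caress"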