nzbToMedia/libs/common/jellyfish/porter.py
Labrys of Knossos 56c6773c6b Update vendored beets to 1.6.0
Updates colorama to 0.4.6
Adds confuse version 1.7.0
Updates jellyfish to 0.9.0
Adds mediafile 0.10.1
Updates munkres to 1.1.4
Updates musicbrainzngs to 0.7.1
Updates mutagen to 1.46.0
Updates pyyaml to 6.0
Updates unidecode to 1.3.6
2022-11-29 00:44:48 -05:00

242 lines
6.8 KiB
Python

_s2_options = {
"a": (
(["a", "t", "i", "o", "n", "a", "l"], ["a", "t", "e"]),
(["t", "i", "o", "n", "a", "l"], ["t", "i", "o", "n"]),
),
"c": (
(["e", "n", "c", "i"], ["e", "n", "c", "e"]),
(["a", "n", "c", "i"], ["a", "n", "c", "e"]),
),
"e": ((["i", "z", "e", "r"], ["i", "z", "e"]),),
"l": (
(["b", "l", "i"], ["b", "l", "e"]),
(["a", "l", "l", "i"], ["a", "l"]),
(["e", "n", "t", "l", "i"], ["e", "n", "t"]),
(["e", "l", "i"], ["e"]),
(["o", "u", "s", "l", "i"], ["o", "u", "s"]),
),
"o": (
(["i", "z", "a", "t", "i", "o", "n"], ["i", "z", "e"]),
(["a", "t", "i", "o", "n"], ["a", "t", "e"]),
(["a", "t", "o", "r"], ["a", "t", "e"]),
),
"s": (
(["a", "l", "i", "s", "m"], ["a", "l"]),
(["i", "v", "e", "n", "e", "s", "s"], ["i", "v", "e"]),
(["f", "u", "l", "n", "e", "s", "s"], ["f", "u", "l"]),
(["o", "u", "s", "n", "e", "s", "s"], ["o", "u", "s"]),
),
"t": (
(["a", "l", "i", "t", "i"], ["a", "l"]),
(["i", "v", "i", "t", "i"], ["i", "v", "e"]),
(["b", "i", "l", "i", "t", "i"], ["b", "l", "e"]),
),
"g": ((["l", "o", "g", "i"], ["l", "o", "g"]),),
}
_s3_options = {
"e": (
(["i", "c", "a", "t", "e"], ["i", "c"]),
(["a", "t", "i", "v", "e"], []),
(["a", "l", "i", "z", "e"], ["a", "l"]),
),
"i": ((["i", "c", "i", "t", "i"], ["i", "c"]),),
"l": ((["i", "c", "a", "l"], ["i", "c"]), (["f", "u", "l"], [])),
"s": ((["n", "e", "s", "s"], []),),
}
_s4_endings = {
"a": (["a", "l"],),
"c": (["a", "n", "c", "e"], ["e", "n", "c", "e"]),
"e": (["e", "r"],),
"i": (["i", "c"],),
"l": (["a", "b", "l", "e"], ["i", "b", "l", "e"]),
"n": (
["a", "n", "t"],
["e", "m", "e", "n", "t"],
["m", "e", "n", "t"],
["e", "n", "t"],
),
# handle 'o' separately
"s": (["i", "s", "m"],),
"t": (["a", "t", "e"], ["i", "t", "i"]),
"u": (["o", "u", "s"],),
"v": (["i", "v", "e"],),
"z": (["i", "z", "e"],),
}
class Stemmer(object):
def __init__(self, b):
self.b = list(b)
self.k = len(b) - 1
self.j = 0
def cons(self, i):
""" True iff b[i] is a consonant """
if self.b[i] in "aeiou":
return False
elif self.b[i] == "y":
return True if i == 0 else not self.cons(i - 1)
return True
def m(self):
n = i = 0
while True:
if i > self.j:
return n
if not self.cons(i):
break
i += 1
i += 1
while True:
while True:
if i > self.j:
return n
if self.cons(i):
break
i += 1
i += 1
n += 1
while True:
if i > self.j:
return n
if not self.cons(i):
break
i += 1
i += 1
def vowel_in_stem(self):
""" True iff 0...j contains vowel """
for i in range(0, self.j + 1):
if not self.cons(i):
return True
return False
def doublec(self, j):
""" True iff j, j-1 contains double consonant """
if j < 1 or self.b[j] != self.b[j - 1]:
return False
return self.cons(j)
def cvc(self, i):
""" True iff i-2,i-1,i is consonant-vowel consonant
and if second c isn't w,x, or y.
used to restore e at end of short words like cave, love, hope, crime
"""
if (
i < 2
or not self.cons(i)
or self.cons(i - 1)
or not self.cons(i - 2)
or self.b[i] in "wxy"
):
return False
return True
def ends(self, s):
length = len(s)
""" True iff 0...k ends with string s """
res = self.b[self.k - length + 1 : self.k + 1] == s
if res:
self.j = self.k - length
return res
def setto(self, s):
""" set j+1...k to string s, readjusting k """
length = len(s)
self.b[self.j + 1 : self.j + 1 + length] = s
self.k = self.j + length
def r(self, s):
if self.m() > 0:
self.setto(s)
def step1ab(self):
if self.b[self.k] == "s":
if self.ends(["s", "s", "e", "s"]):
self.k -= 2
elif self.ends(["i", "e", "s"]):
self.setto(["i"])
elif self.b[self.k - 1] != "s":
self.k -= 1
if self.ends(["e", "e", "d"]):
if self.m() > 0:
self.k -= 1
elif (
self.ends(["e", "d"]) or self.ends(["i", "n", "g"])
) and self.vowel_in_stem():
self.k = self.j
if self.ends(["a", "t"]):
self.setto(["a", "t", "e"])
elif self.ends(["b", "l"]):
self.setto(["b", "l", "e"])
elif self.ends(["i", "z"]):
self.setto(["i", "z", "e"])
elif self.doublec(self.k):
self.k -= 1
if self.b[self.k] in "lsz":
self.k += 1
elif self.m() == 1 and self.cvc(self.k):
self.setto(["e"])
def step1c(self):
""" turn terminal y into i if there's a vowel in stem """
if self.ends(["y"]) and self.vowel_in_stem():
self.b[self.k] = "i"
def step2and3(self):
for end, repl in _s2_options.get(self.b[self.k - 1], []):
if self.ends(end):
self.r(repl)
break
for end, repl in _s3_options.get(self.b[self.k], []):
if self.ends(end):
self.r(repl)
break
def step4(self):
ch = self.b[self.k - 1]
if ch == "o":
if not (
(self.ends(["i", "o", "n"]) and self.b[self.j] in "st")
or self.ends(["o", "u"])
):
return
else:
endings = _s4_endings.get(ch, [])
for end in endings:
if self.ends(end):
break
else:
return
if self.m() > 1:
self.k = self.j
def step5(self):
self.j = self.k
if self.b[self.k] == "e":
a = self.m()
if a > 1 or a == 1 and not self.cvc(self.k - 1):
self.k -= 1
if self.b[self.k] == "l" and self.doublec(self.k) and self.m() > 1:
self.k -= 1
def result(self):
return "".join(self.b[: self.k + 1])
def stem(self):
if self.k > 1:
self.step1ab()
self.step1c()
self.step2and3()
self.step4()
self.step5()
return self.result()