plexpy/lib/twitter/parse_tweet.py
2021-10-15 01:56:57 -07:00

101 lines
3.5 KiB
Python

#!/usr/bin/env python
import re
class Emoticons:
    """Lookup tables of text emoticons, grouped by sentiment.

    Both attributes are plain lists of exact token strings; callers test
    whitespace-separated tweet tokens for membership in them.
    """

    # NOTE(review): "(:;)" and "{;:]" look like two emoticons fused by a
    # missing comma ("(:" + ";)" and "{;" + ":]").  The fused literals are
    # kept so existing behaviour is a strict subset of the new behaviour;
    # "(:" and ";)" are added because they appeared nowhere else in the list
    # ("{;" and ":]" were already listed separately).
    # NOTE(review): ";=)" is listed twice; harmless for membership tests.
    POSITIVE = ["*O", "*-*", "*O*", "*o*", "* *",
                ":P", ":D", ":d", ":p",
                ";P", ";D", ";d", ";p",
                ":-)", ";-)", ":=)", ";=)",
                ":<)", ":>)", ";>)", ";=)",
                "=}", ":)", "(:;)", "(:", ";)",
                "(;", ":}", "{:", ";}",
                "{;:]",
                "[;", ":')", ";')", ":-3",
                "{;", ":]",
                ";-3", ":-x", ";-x", ":-X",
                ";-X", ":-}", ";-=}", ":-]",
                ";-]", ":-.)",
                "^_^", "^-^"]

    NEGATIVE = [":(", ";(", ":'(",
                "=(", "={", "):", ");",
                ")':", ")';", ")=", "}=",
                ";-{{", ";-{", ":-{{", ":-{",
                ":-(", ";-(",
                ":,)", ":'{",
                "[:", ";]"
                ]
class ParseTweet(object):
    """Extract handles, hashtags, URLs, RT/MT markers and emoticons from a tweet.

    All parsing is done in the constructor and stored on the instance.  The
    static ``get*`` helpers can also be called directly on a raw tweet string.
    """

    # Pattern sources; replaced just below by their compiled forms so the
    # compilation happens once, at import time.
    #   RT / MT : the marker must be a whole word at the start of the tweet,
    #             so text such as "RTFM" or "MTV" is not treated as a marker.
    #   URL     : an optional "http://" or "https://" scheme followed by
    #             letters/digits/slashes containing at least one dot.  The
    #             scheme is a non-capturing group, so findall() still returns
    #             one string per URL.
    regexp = {
        "RT": r"^RT\b",
        "MT": r"^MT\b",
        "ALNUM": r"(@[a-zA-Z0-9_]+)",
        "HASHTAG": r"(#\w+)",
        "URL": r"((?:https?://)?[a-zA-Z\d/]+\.+[a-zA-Z\d/.]+)",
        "SPACES": r"\s+",
    }
    regexp = {key: re.compile(value) for key, value in regexp.items()}

    def __init__(self, timeline_owner, tweet):
        """Parse ``tweet`` and store the results as attributes.

        Args:
            timeline_owner: Twitter handle of the account the tweet was read from.
            tweet: Raw tweet text.

        Attributes set:
            Owner: ``timeline_owner``, or the first user handle in the tweet
                when the tweet is a retweet that mentions at least one handle.
            tweet: The raw text as given.
            UserHandles: List of ``@handle`` strings, in order of occurrence.
            Hashtags: List of ``#tag`` strings.
            URLs: List of URL strings.
            RT, MT: Booleans, True when the tweet starts with that marker.
            Emoticon: List of recognised emoticon tokens.
        """
        self.Owner = timeline_owner
        self.tweet = tweet
        self.UserHandles = ParseTweet.getUserHandles(tweet)
        self.Hashtags = ParseTweet.getHashtags(tweet)
        self.URLs = ParseTweet.getURLs(tweet)
        self.RT = ParseTweet.getAttributeRT(tweet)
        self.MT = ParseTweet.getAttributeMT(tweet)
        self.Emoticon = ParseTweet.getAttributeEmoticon(tweet)

        # A retweet is attributed to the first handle it mentions rather
        # than to the timeline it was read from.
        if self.RT and self.UserHandles:
            self.Owner = self.UserHandles[0]

    def __str__(self):
        """Return a one-line summary: owner, item counts, length and RT/MT flags."""
        return "owner %s, urls: %d, hashtags %d, user_handles %d, len_tweet %d, RT = %s, MT = %s" % \
            (self.Owner, len(self.URLs), len(self.Hashtags), len(self.UserHandles),
             len(self.tweet), self.RT, self.MT)

    @staticmethod
    def getAttributeEmoticon(tweet):
        """Return the whitespace-separated tokens of ``tweet`` that are known emoticons.

        A token counts when it is listed in ``Emoticons.POSITIVE`` or
        ``Emoticons.NEGATIVE``.  Order and repeats are preserved.
        """
        tokens = ParseTweet.regexp["SPACES"].split(tweet.strip())
        return [tok for tok in tokens
                if tok in Emoticons.POSITIVE or tok in Emoticons.NEGATIVE]

    @staticmethod
    def getAttributeRT(tweet):
        """Return True when ``tweet`` (ignoring surrounding whitespace) starts with the word "RT"."""
        return ParseTweet.regexp["RT"].search(tweet.strip()) is not None

    @staticmethod
    def getAttributeMT(tweet):
        """Return True when ``tweet`` (ignoring surrounding whitespace) starts with the word "MT"."""
        return ParseTweet.regexp["MT"].search(tweet.strip()) is not None

    @staticmethod
    def getUserHandles(tweet):
        """Return every ``@handle`` in ``tweet``, in order of occurrence."""
        return ParseTweet.regexp["ALNUM"].findall(tweet)

    @staticmethod
    def getHashtags(tweet):
        """Return every ``#hashtag`` in ``tweet``, in order of occurrence."""
        return ParseTweet.regexp["HASHTAG"].findall(tweet)

    @staticmethod
    def getURLs(tweet):
        """Return every URL-like substring of ``tweet``, in order of occurrence.

        A match is an optional ``http://`` / ``https://`` scheme followed by
        letters, digits and slashes containing at least one dot; the scheme,
        when present, is included in the returned string.
        """
        return ParseTweet.regexp["URL"].findall(tweet)