Security & Twitter: A Semantic Similarity Experiment of Tweets from CEOs
In [128]:
import tweepy, re, apiKey, csv, retinasdk, random, dateutil.parser, pytz
import pandas as pd
import numpy as np
import gensim
from gensim import utils
from gensim.models import Doc2Vec
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import pandas_datareader as pdr
import plotly.tools as tls
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier
In [129]:
# Helper function: parse a Twitter timestamp string and normalize it to UTC
def time_transfer(created_at):
    new_time = dateutil.parser.parse(created_at).astimezone(pytz.utc)
    return new_time
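A quick check of the helper on Twitter's created_at format (the same fixed date reused below as the time-window seed):
In [ ]:
# Twitter-style timestamps parse directly; the result is timezone-aware UTC
time_transfer("Thu Nov 01 00:00:00 +0000 2018")
# -> datetime.datetime(2018, 11, 1, 0, 0, tzinfo=<UTC>)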
In [130]:
# Function 1: read a CSV file to get the ticker and Twitter account of each CEO
def get_tweet_info(csv_name):
    tweet_info = {}
    with open(str(csv_name)) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for row in csv_reader:
            tweet_info[row[0]] = row[1]
    return tweet_info
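get_tweet_info assumes one ticker,handle pair per row; judging from the run at the bottom, 5CEO.csv would look like this (hypothetical file contents):
In [ ]:
# Hypothetical contents of 5CEO.csv -- one "ticker,twitter_handle" row per company:
#   AAPL,tim_cook
#   GOOGL,GoogleInstantly
#   TWTR,jack
#   AMZN,JeffBezos
#   MSFT,satyanadella
get_tweet_info('5CEO.csv')
# -> {'AAPL': 'tim_cook', 'GOOGL': 'GoogleInstantly', ...}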
In [131]:
# Function 2: get tweets and their timeline for one screen name
def get_all_tweets_with_filter(screen_name):
    print("Start to retrieve tweets from @" + screen_name + "...")
    # Twitter API credentials (defined elsewhere, e.g. in the apiKey module)
    auth = tweepy.OAuthHandler(twitter_customer, twitter_customer_secret)
    auth.set_access_token(twitter_token, twitter_secret)
    api = tweepy.API(auth)
    alltweets = []
    user = api.get_user(screen_name=screen_name)
    new_tweets = api.user_timeline(screen_name=screen_name, count=50)
    alltweets.extend(new_tweets)
    print("Start to clean " + str(len(alltweets)) + " tweets from " + screen_name)
    start_t = end_t = time_transfer("Thu Nov 01 00:00:00 +0000 2018")
    # Compile the emoji-stripping pattern once, outside the loop
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags
        u"\U00002702-\U000027B0"  # dingbats
        u"\U000024C2-\U0001F251"  # enclosed characters
        "]+", flags=re.UNICODE)
    outtweets = []
    for tweet in alltweets:
        te_text = emoji_pattern.sub("", tweet.text)
        tweet_content = clean_text(str(te_text).encode('ascii', 'ignore'))
        te_time = time_transfer(str(tweet.created_at))
        # Track the earliest and latest tweet times for the stock-price window
        if te_time < start_t:
            start_t = te_time
        elif te_time > end_t:
            end_t = te_time
        if tweet_content and not tweet_content.isspace():
            outtweets.append([str(tweet_content), te_time])
    print("Finish cleaning tweets from @" + screen_name)
    return [outtweets, start_t, end_t]

def clean_text(twitter_text):
    # Drop URLs (via re.sub, since str.replace does not take a regex),
    # retweet markers, byte-string artifacts, @-signs and newlines
    text = twitter_text.decode('utf_8')
    remove_http = re.sub(r"http\S+", "", text).replace("http", "")
    no_b = remove_http.replace('b\'RT', '').replace('\'b', '').replace('RT', '').replace('b\'', '').replace('\'', '')
    no_at = no_b.replace('@', '')
    return no_at.replace('\n', '')
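A small sanity check of the cleaner (the sample tweet is made up):
In [ ]:
# Hypothetical raw tweet: the URL, RT marker and @-sign should all be stripped
clean_text(b"RT @tim_cook: Great day at Apple Park! https://t.co/abc123")
# -> ' tim_cook: Great day at Apple Park! '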
In [132]:
# Function 3: keyword retriever for each tweet (cortical.io Retina API)
def keyword_retrieve(tweet):
    liteClient = retinasdk.LiteClient(apiKey.retina_token)
    out_keywords = liteClient.getKeywords(tweet)
    return out_keywords
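getKeywords returns a list of keyword strings, which compare() later feeds to infer_vector as a token list. A sketch (the keywords shown are illustrative, not real API output):
In [ ]:
keyword_retrieve("Apple is expanding its campus in Austin")
# -> ['apple', 'campus', 'austin']  (illustrative; actual keywords depend on the Retina API)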
In [133]:
# Function 4: train a model using Doc2Vec
def train_model():
    train_corpus = []
    with open('Sentiment-Analysis-Dataset-Debug.csv') as ds_file:
        ds_reader = csv.reader(ds_file, delimiter=',')
        for r in ds_reader:
            train_corpus.append(read_corpus(r, False, ds_reader.line_num))
    model = Doc2Vec(vector_size=50, min_count=2, epochs=40)
    model.build_vocab(train_corpus)
    model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)
    return [train_corpus, model]

def train_data():
    train_data = []
    with open('Sentiment-Analysis-Dataset.csv') as ds_file:
        ds_reader = csv.reader(ds_file, delimiter=',')
        for r in ds_reader:
            if int(r[1]) == 0:  # was `line[1]`, an undefined name
                train_data.append((r[3], 'neg'))
            else:
                train_data.append((r[3], 'pos'))
    return train_data

def read_corpus(line, tokens_only=False, i=1):
    if tokens_only:
        return gensim.utils.simple_preprocess(line)
    else:
        # Tag each document with its sentiment label (column 1) followed by
        # its line number, e.g. '1375' for a positive tweet on line 375
        return gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line[3]), [str(line[1]) + str(i)])
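The tag encoding matters downstream: compare() reads the first character of the best-matching tag back as the sentiment label. A minimal round trip (hypothetical dataset row):
In [ ]:
# Column 1 is the sentiment flag, column 3 the tweet text
doc = read_corpus(['', '1', '', 'what a great product launch'], False, 375)
doc.tags             # -> ['1375']
int(doc.tags[0][0])  # -> 1, i.e. positive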
In [144]:
def compare(csv_name):
    tweet_info_list = get_tweet_info(str(csv_name))
    for corp, ceo in tweet_info_list.items():
        ceo_tweet = get_all_tweets_with_filter(ceo)
        print("Plotting for " + corp + '...')
        if len(ceo_tweet[0]) > 0:
            all_create_at = []
            all_emotion = []
            all_emotion_blob = []
            for outtweet in ceo_tweet[0]:
                # outtweet: [content_of_tweet, create_at]
                test_str = outtweet[0]
                create_at = outtweet[1]
                test_corpus = keyword_retrieve(test_str.encode('latin-1', 'ignore'))
                # Find the most similar doc in the training set and use it as
                # the emotion standard for this sentence
                inferred_vector = model.infer_vector(test_corpus)
                sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
                # Only keep tweets whose best match exceeds 80% similarity
                # (the original compared the 0-100 scale against 0.8, which
                # let almost everything through)
                chance = 100 * float(sims[0][1])
                if chance > 80:
                    # The tag's first character is the sentiment label (see read_corpus)
                    emotion = int(sims[0][0][0])
                    blob = TextBlob(test_str)
                    s = 0
                    for sentence in blob.sentences:
                        s = s + sentence.sentiment.polarity
                    all_emotion_blob.append(s)
                    all_create_at.append(create_at)
                    # Doc2Vec label kept alongside the TextBlob polarity sum
                    all_emotion.append(emotion)
            # Plot evidence: stock price vs. tweet sentiment on twin y-axes.
            # Is the sentiment trend the same as the price trend?
            start_date = ceo_tweet[1].replace(tzinfo=None)
            stop_date = ceo_tweet[2].replace(tzinfo=None)
            fig, ax1 = plt.subplots(figsize=(15, 10))
            stock = pdr.data.DataReader(str(corp), 'iex', start_date, stop_date)
            stock.index = pd.to_datetime(stock.index)
            plt.title(str(corp), fontsize=20)
            color = 'tab:red'
            ax1.set_xlabel('time')
            ax1.set_ylabel('price', color=color)
            ax1.plot(stock['close'], c='r', ls=':', lw=3, label='stock price')
            ax1.tick_params(axis='y', labelcolor=color)
            ax2 = ax1.twinx()
            color = 'tab:blue'
            ax2.set_ylabel('emotion', color=color)
            ax2.plot(all_create_at, all_emotion_blob, c='b', ls='--', lw=3, label='emotion sum')
            ax2.tick_params(axis='y', labelcolor=color)
            plt.tight_layout()
            plt.show()
In [124]:
##########################
# Start of Main Program
##########################
# %pylab inline
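compare() reads the trained Doc2Vec model from the global name model, which is never assigned in the cells above; presumably it was trained in an earlier session. For a top-to-bottom run, a training step like this (a sketch using Function 4, assuming its training CSV is on disk) has to come first:
In [ ]:
# Train the sentiment Doc2Vec model once so compare() can use it
train_corpus, model = train_model()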
In [145]:
#Tech Industry
compare('5CEO.csv')
Start to retrieve tweets from @tim_cook...
Start to clean 50 tweets from tim_cook
Finish cleaning tweets from @tim_cook
Plotting for AAPL...
Start to retrieve tweets from @GoogleInstantly...
Start to clean 50 tweets from GoogleInstantly
Finish cleaning tweets from @GoogleInstantly
Plotting for GOOGL...
Start to retrieve tweets from @jack...
Start to clean 50 tweets from jack
Finish cleaning tweets from @jack
Plotting for TWTR...
Start to retrieve tweets from @JeffBezos...
Start to clean 50 tweets from JeffBezos
Finish cleaning tweets from @JeffBezos
Plotting for AMZN...
Start to retrieve tweets from @satyanadella...
Start to clean 50 tweets from satyanadella
Finish cleaning tweets from @satyanadella
Plotting for MSFT...
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-145-a01011499533> in <module>
      1 #Tech Industry
----> 2 compare('5CEO.csv')

<ipython-input-144-a377fba008dc> in compare(csv_name)
     15                 #find most similar doc from training set, use it as the emotion standard for this sentence
     16                 inferred_vector = model.infer_vector(test_corpus)
---> 17                 sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
     18                 #Only get tweets with percentage > 80%
     19                 chance = (100 * float(sims[0][1]))

~/security/lib/python3.6/site-packages/gensim/models/keyedvectors.py in most_similar(self, positive, negative, topn, clip_start, clip_end, indexer)
   1666         result = [
   1667             (self._index_to_doctag(sim + clip_start, self.offset2doctag, self.max_rawint), float(dists[sim]))
-> 1668             for sim in best
   1669             if (sim + clip_start) not in all_docs
   1670         ]

~/security/lib/python3.6/site-packages/gensim/models/keyedvectors.py in <listcomp>(.0)
   1667             (self._index_to_doctag(sim + clip_start, self.offset2doctag, self.max_rawint), float(dists[sim]))
   1668             for sim in best
-> 1669             if (sim + clip_start) not in all_docs
   1670         ]
   1671         return result[:topn]

~/security/lib/python3.6/site-packages/gensim/models/keyedvectors.py in _index_to_doctag(i_index, offset2doctag, max_rawint)
   1852     """Get string key for given `i_index`, if available. Otherwise return raw int doctag (same int)."""
   1853     candidate_offset = i_index - max_rawint - 1
-> 1854     if 0 <= candidate_offset < len(offset2doctag):
   1855         return offset2doctag[candidate_offset]
   1856     else:

KeyboardInterrupt: