Security & Twitter: A Semantic Similarity Experiment on Tweets from CEOs

In [128]:
import tweepy, re, apiKey, csv, retinasdk, random, dateutil.parser, pytz  # dateutil.parser needs an explicit import
import pandas as pd
import numpy as np
import gensim
from gensim import utils
from gensim.models import Doc2Vec
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import pandas_datareader as pdr
import plotly.tools as tls
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier
In [129]:
#Helper function: convert Twitter's created_at string to a timezone-aware UTC datetime
def time_transfer(created_at):
    new_time = dateutil.parser.parse(created_at).astimezone(pytz.utc)
    return new_time
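As a quick sanity check (illustrative value), the helper turns Twitter's created_at strings into timezone-aware UTC datetimes:

time_transfer("Thu Nov 01 00:00:00 +0000 2018")
# -> datetime.datetime(2018, 11, 1, 0, 0, tzinfo=<UTC>)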
In [130]:
#Function 1: Read CSV file mapping each stock ticker to its CEO's Twitter handle
def get_tweet_info(csv_name):
    tweet_info = {}
    with open(str(csv_name)) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for row in csv_reader:
            tweet_info[row[0]] = row[1]
    return tweet_info
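Judging from the run at the bottom of this notebook, 5CEO.csv presumably holds one ticker-to-handle pair per row; a sketch of the expected file and lookup:

# 5CEO.csv (contents inferred from the output below; format: ticker,handle)
#   AAPL,tim_cook
#   GOOGL,GoogleInstantly
#   TWTR,jack
#   AMZN,JeffBezos
#   MSFT,satyanadella
get_tweet_info('5CEO.csv')
# -> {'AAPL': 'tim_cook', 'GOOGL': 'GoogleInstantly', 'TWTR': 'jack',
#     'AMZN': 'JeffBezos', 'MSFT': 'satyanadella'}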
In [131]:
#Function 2: Get tweets & timeline
def get_all_tweets_with_filter(screen_name):
    print("Start to retrieve tweets from @" + screen_name + "...")
    #Twitter credentials (twitter_customer, twitter_customer_secret, twitter_token,
    #twitter_secret) are assumed to be defined elsewhere, presumably in the local
    #apiKey module imported at the top
    auth = tweepy.OAuthHandler(twitter_customer, twitter_customer_secret)
    auth.set_access_token(twitter_token, twitter_secret)
    api = tweepy.API(auth)

    alltweets = []
    user = api.get_user(screen_name=screen_name)
    new_tweets = api.user_timeline(screen_name=screen_name, count=50) 
    alltweets.extend(new_tweets)
    print("Start to clean " + str(len(alltweets)) + " tweets from " + screen_name)
    #Seed the tweet window at Nov 1 2018, then widen it below to span all tweets
    start_t = end_t = time_transfer("Thu Nov 01 00:00:00 +0000 2018")
    #Compile the emoji-stripping pattern once, outside the per-tweet loop
    emoji_pattern = re.compile("["
                   u"\U0001F600-\U0001F64F"  # emoticons
                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                   u"\U0001F1E0-\U0001F1FF"  # flags
                   u"\U00002702-\U000027B0"  # dingbats
                   u"\U000024C2-\U0001F251"  # enclosed characters
                   "]+", flags=re.UNICODE)
    outtweets = []
    for tweet in alltweets:
        te_text = emoji_pattern.sub("", tweet.text)
        tweet_content = clean_text(str(te_text).encode('ascii', 'ignore'))
        te_time = time_transfer(str(tweet.created_at))
        if te_time < start_t:
            start_t = te_time
        elif te_time > end_t:
            end_t = te_time
        if tweet_content and not tweet_content.isspace():
            outtweets.append([str(tweet_content), te_time])
    print("Finish cleaning tweets from @" + screen_name)
    return [outtweets, start_t, end_t]

def clean_text(twitter_text):
    #str.replace treats patterns literally, so URLs must be stripped with re.sub
    text = re.sub(r"http\S+", "", twitter_text.decode('utf_8')).replace("http", "")
    #Drop byte-string artifacts ("b'", quotes) and retweet markers
    no_b = text.replace('b\'RT', '').replace('\'b', '').replace('RT', '').replace('b\'', '').replace('\'', '')
    no_at = no_b.replace('@', '')
    return no_at.replace('\n', '')
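A minimal, illustrative check of the cleanup (made-up tweet bytes):

clean_text(b"b'RT @tim_cook: Great day at Apple Park! http://t.co/abc123")
# -> ' tim_cook: Great day at Apple Park! '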
In [132]:
#Function 3: Retrieve keywords from each tweet via the cortical.io Retina API
def keyword_retrieve(tweet):
    liteClient = retinasdk.LiteClient(apiKey.retina_token)
    out_keywords = liteClient.getKeywords(tweet)
    return out_keywords
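For reference, LiteClient.getKeywords returns a plain list of keyword strings, which doubles as the token list fed to infer_vector later (the output below is illustrative, not an actual API response):

keyword_retrieve("Apple reports record quarterly earnings")
# -> ['apple', 'earnings', 'quarterly']   (illustrative; actual keywords depend on the Retina model)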
In [133]:
#Function 4: Train model using Doc2Vec
def train_model():
    train_corpus = []
    with open('Sentiment-Analysis-Dataset-Debug.csv') as ds_file:
        ds_reader = csv.reader(ds_file, delimiter=',')
        for r in ds_reader:
            train_corpus.append(read_corpus(r, False, ds_reader.line_num))
    model = Doc2Vec(vector_size=50, min_count=2, epochs=40)
    model.build_vocab(train_corpus)
    model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)
    return [train_corpus, model]
  
def train_data():
    train_data = []
    with open('Sentiment-Analysis-Dataset.csv') as ds_file:
        ds_reader = csv.reader(ds_file, delimiter=',')
        for r in ds_reader:
            #Column 1 holds the sentiment label: 0 = negative, otherwise positive
            if int(r[1]) == 0:
                train_data.append((r[3], 'neg'))
            else:
                train_data.append((r[3], 'pos'))
    return train_data
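
# Note: train_data() yields (text, 'pos'/'neg') pairs shaped for textblob's
# NaiveBayesClassifier (imported above but never used in the cells shown);
# presumably it was intended for something like:
#   cl = NaiveBayesClassifier(train_data())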

def read_corpus(line, tokens_only=False, i=1):
    if tokens_only:
        return gensim.utils.simple_preprocess(line)
    else:
        #Tag each document with its sentiment label (line[1]) followed by the row
        #number, so the label can be recovered from the doctag's first character
        return gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line[3]), [str(line[1]) + str(i)])
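compare() below reads model as a global, so a training cell must have run before it; presumably something like this (a minimal sketch, the actual cell is not shown):

train_corpus, model = train_model()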
In [144]:
def compare(csv_name):
    tweet_info_list = get_tweet_info(str(csv_name))
    for corp, ceo in tweet_info_list.items():
        ceo_tweet = get_all_tweets_with_filter(ceo)
        print("Plotting for " + corp + '...')
        if len(ceo_tweet[0]) > 0:
            all_create_at = []
            all_emotion = []
            all_emotion_blob = []
            for outtweet in ceo_tweet[0]:
                #outtweet: [content_of_tweet, create_at]
                test_str = outtweet[0]
                create_at = outtweet[1]
                test_corpus = keyword_retrieve(test_str.encode('latin-1', 'ignore'))
                #Find the most similar doc in the training set and use its label as
                #the emotion standard for this tweet
                inferred_vector = model.infer_vector(test_corpus)
                #topn=len(model.docvecs) ranks every training document, which is slow
                sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
                #Only keep tweets whose best match exceeds 80% similarity
                chance = 100 * float(sims[0][1])
                if chance > 80:
                    location = str(sims[0][0][1:])  # row-number part of the doctag (unused below)
                    emotion = int(sims[0][0][0])    # sentiment label encoded as the doctag's first character
                    #Sum TextBlob sentence polarities as a second sentiment signal
                    blob = TextBlob(test_str)
                    s = 0
                    for sentence in blob.sentences:
                        s = s + sentence.sentiment.polarity
                    all_emotion_blob.append(s)
                    all_create_at.append(create_at)
                    all_emotion.append(emotion)

            #Plot evidence: stock price (left axis) against tweet sentiment (right axis)
            start_date = ceo_tweet[1].replace(tzinfo=None)
            stop_date = ceo_tweet[2].replace(tzinfo=None)
            fig, ax1 = plt.subplots(figsize=(15, 10))
            #Fetch daily prices over the tweet window from the IEX source
            stock = pdr.data.DataReader(str(corp), 'iex', start_date, stop_date)
            stock.index = pd.to_datetime(stock.index)
            stock_c = stock['close']

            plt.title(str(corp), fontsize=20)

            #Plot opinions. Is the trend the same?
            color = 'tab:red'
            ax1.set_xlabel('time')
            ax1.set_ylabel('price', color=color)
            ax1.plot(stock_c, c='r', ls=':', lw=3, label='stock price')
            ax1.tick_params(axis='y', labelcolor=color)

            ax2 = ax1.twinx()
            color = 'tab:blue'
            ax2.set_ylabel('emotion', color=color)
            ax2.plot(all_create_at, all_emotion_blob, c='b', ls='--', lw=3, label='emotion sum')
            ax2.tick_params(axis='y', labelcolor=color)
            plt.tight_layout()
            plt.show()
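To make the doctag decoding above concrete, here is a minimal sketch with synthetic similarity results (tags follow the label-then-row-number scheme from read_corpus, where 0 = negative and anything else = positive per train_data):

sims_example = [('1042', 0.91), ('0517', 0.88)]  # synthetic (doctag, cosine similarity) pairs
best_tag, best_sim = sims_example[0]
emotion = int(best_tag[0])   # first character is the sentiment label (1 -> positive)
location = best_tag[1:]      # remainder is the training-set row number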
In [124]:
##########################
# Start of Main Program
##########################
# %pylab inline
In [145]:
#Tech Industry
compare('5CEO.csv')
Start to retrieve tweets from @tim_cook...
Start to clean 50 tweets from tim_cook
Finish cleaning tweets from @tim_cook
Plotting for AAPL...
Start to retrieve tweets from @GoogleInstantly...
Start to clean 50 tweets from GoogleInstantly
Finish cleaning tweets from @GoogleInstantly
Plotting for GOOGL...
Start to retrieve tweets from @jack...
Start to clean 50 tweets from jack
Finish cleaning tweets from @jack
Plotting for TWTR...
Start to retrieve tweets from @JeffBezos...
Start to clean 50 tweets from JeffBezos
Finish cleaning tweets from @JeffBezos
Plotting for AMZN...
Start to retrieve tweets from @satyanadella...
Start to clean 50 tweets from satyanadella
Finish cleaning tweets from @satyanadella
Plotting for MSFT...
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-145-a01011499533> in <module>
      1 #Tech Industry
----> 2 compare('5CEO.csv')

<ipython-input-144-a377fba008dc> in compare(csv_name)
     15                 #find most similar doc from training set, use it as the emotion standard for this sentence
     16                 inferred_vector = model.infer_vector(test_corpus)
---> 17                 sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
     18                  #Only get tweets with percentage > 80%
     19                 chance = (100 * float(sims[0][1]))

~/security/lib/python3.6/site-packages/gensim/models/keyedvectors.py in most_similar(self, positive, negative, topn, clip_start, clip_end, indexer)
   1666         result = [
   1667             (self._index_to_doctag(sim + clip_start, self.offset2doctag, self.max_rawint), float(dists[sim]))
-> 1668             for sim in best
   1669             if (sim + clip_start) not in all_docs
   1670         ]

~/security/lib/python3.6/site-packages/gensim/models/keyedvectors.py in <listcomp>(.0)
   1667             (self._index_to_doctag(sim + clip_start, self.offset2doctag, self.max_rawint), float(dists[sim]))
   1668             for sim in best
-> 1669             if (sim + clip_start) not in all_docs
   1670         ]
   1671         return result[:topn]

~/security/lib/python3.6/site-packages/gensim/models/keyedvectors.py in _index_to_doctag(i_index, offset2doctag, max_rawint)
   1852         """Get string key for given `i_index`, if available. Otherwise return raw int doctag (same int)."""
   1853         candidate_offset = i_index - max_rawint - 1
-> 1854         if 0 <= candidate_offset < len(offset2doctag):
   1855             return offset2doctag[candidate_offset]
   1856         else:

KeyboardInterrupt:
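Note: the run was stopped by hand (the KeyboardInterrupt above) while scoring @satyanadella's tweets; because most_similar is called with topn=len(model.docvecs), every tweet is ranked against the entire training corpus, which makes this step slow.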