After obtaining the news that can move the market, we can analyse the headlines and extract the common topics among them. This blog introduces how to conduct Latent Semantic Analysis (LSA) and Latent Dirichlet Allocation (LDA), leveraging the sklearn and gensim libraries.

Text preprocessing

First, we preprocess the text: for example, removing special characters, short words and stop words.

import pandas as pd 
import numpy as np 
import os 
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings
import re
import nltk
import matplotlib.colors as mcolors
# Suppress warnings
warnings.filterwarnings('ignore')
os.chdir('/Users/wanjing_dai/Desktop/HKU/NLP_7036/Group project/nlp_group')
# Read news that is important
stock_news = pd.read_parquet('stock_news_and_mark.parquet')
mark_stock_news = stock_news[stock_news['news_mark'] == 1].reset_index(drop=True)
mark_stock_news = mark_stock_news[['PERMNO', 'date_news', 'headline']]
# Preprocessing text
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from collections import Counter
from nltk.probability import FreqDist
# Remove punctuations and special characters
mark_stock_news['clean_1'] = mark_stock_news['headline'].apply(lambda x: re.sub('[,\!?$:;.()%\']', '', x))
#Convert to lower case
mark_stock_news['clean_1'] = mark_stock_news['clean_1'].apply(lambda x: x.lower())
# Remove short words
mark_stock_news['clean_1'] = mark_stock_news['clean_1'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))
# Lemmatization
wnl = WordNetLemmatizer()
mark_stock_news['clean_1'] = mark_stock_news['clean_1'].apply(lambda x: ' '.join([wnl.lemmatize(y) for y in x.split()]))
# Remove stopwords
stop_words = set(stopwords.words('english'))
mark_stock_news['clean_1'] = mark_stock_news['clean_1'].apply(lambda x: ' '.join([w for w in x.split() if w not in stop_words]))
# Remove spaces
mark_stock_news['clean_1'] = mark_stock_news['clean_1'].apply(lambda x: x.strip())
# Tokenization
mark_stock_news['token'] = mark_stock_news['clean_1'].apply(word_tokenize)

Create a word cloud

Then, let's create a word cloud to get a feel for the news.

# Create a new column containing the length of each headline
mark_stock_news["headline_len"] = mark_stock_news["headline"].apply(lambda x : len(x.split()))
# Plot the news length distribution
sns.displot(mark_stock_news.headline_len, kde=False)
print("The longest headline has: {} words".format(mark_stock_news.headline_len.max()))
# Frequency distribution of words
tokens = mark_stock_news['token'].tolist()
flat_tokens = [item for sublist in tokens for item in sublist]
fd = FreqDist(flat_tokens)
# Plot top 15 words
fd.plot(15)
# Wordcloud
from wordcloud import WordCloud, STOPWORDS
wordcloud = WordCloud(width=800,height=800,background_color='white',stopwords=STOPWORDS)\
.generate(' '.join(flat_tokens))
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud, interpolation='bilinear') 
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

LSA and LDA analysis using sklearn

sklearn offers packages to conduct LSA and LDA analysis, though they are not as powerful as gensim's.

# LSA analysis using sklearn
from sklearn.decomposition import TruncatedSVD
def get_keys(topic_matrix):
    '''
    returns an integer list of predicted topic 
    categories for a given topic matrix
    '''
    keys = topic_matrix.argmax(axis=1).tolist()
    return keys

def keys_to_counts(keys):
    '''
    returns a tuple of topic categories and their 
    accompanying magnitudes for a given list of keys
    '''
    count_pairs = Counter(keys).items()
    categories = [pair[0] for pair in count_pairs]
    counts = [pair[1] for pair in count_pairs]
    return (categories, counts)

def get_top_n_words(n, keys, document_term_matrix, vectorizer):
    '''
    returns a list of n_topic strings, where each string contains the n most common 
    words in a predicted category, in order
    '''
    top_word_indices = []
    for topic in range(n_topics):
        temp_vector_sum = 0
        for i in range(len(keys)):
            if keys[i] == topic:
                temp_vector_sum += document_term_matrix[i]
        temp_vector_sum = temp_vector_sum.toarray()
        top_n_word_indices = np.flip(np.argsort(temp_vector_sum)[0][-n:],0)
        top_word_indices.append(top_n_word_indices)   
    top_words = []
    for topic in top_word_indices:
        topic_words = []
        for index in topic:
            temp_word_vector = np.zeros((1,document_term_matrix.shape[1]))
            temp_word_vector[:,index] = 1
            the_word = vectorizer.inverse_transform(temp_word_vector)[0][0]
            topic_words.append(the_word.encode('ascii').decode('utf-8'))
        top_words.append(" ".join(topic_words))         
    return top_words
# Construct a CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer(stop_words='english') 
# Construct a bag of words
bow = v.fit_transform(mark_stock_news['clean_1'])
# Set the number of topics and fit the truncated SVD (LSA) model
n_topics = 10
lsa_model = TruncatedSVD(n_components=n_topics)
lsa_topic_matrix = lsa_model.fit_transform(bow)
# Use the functions defined above to get the top 5 words for each of the 10 topics
lsa_keys = get_keys(lsa_topic_matrix)
lsa_categories, lsa_counts = keys_to_counts(lsa_keys)
top_n_words_lsa = get_top_n_words(5, lsa_keys, bow, v)
for i in range(len(top_n_words_lsa)):
    print("Topic {}: ".format(i+1), top_n_words_lsa[i])

# Plot the number of headlines assigned to each LSA topic
labels = ['Topic {}: \n'.format(i) + top_n_words_lsa[i] for i in lsa_categories]
fig, ax = plt.subplots(figsize=(40,20))
ax.bar(lsa_categories, lsa_counts);
ax.set_xticks(lsa_categories);
ax.set_xticklabels(labels);
ax.set_ylabel('Number of headlines');
ax.set_title('LSA topic counts');
plt.show()

# LDA analysis (Using Sklearn)
from sklearn.decomposition import LatentDirichletAllocation
# Fit the LDA model to the bag-of-words matrix
lda_model_s = LatentDirichletAllocation(n_components=n_topics, learning_method='online', 
                                          random_state=0, verbose=0)
lda_topic_matrix = lda_model_s.fit_transform(bow)
# Use the functions defined above to get the top 5 words for each LDA topic
lda_keys = get_keys(lda_topic_matrix)
lda_categories, lda_counts = keys_to_counts(lda_keys)
top_n_words_lda = get_top_n_words(5, lda_keys, bow, v)
for i in range(len(top_n_words_lda)):
    print("Topic {}: ".format(i+1), top_n_words_lda[i])

# Plot the number of headlines assigned to each LDA topic
labels = ['Topic {}: \n'.format(i) + top_n_words_lda[i] for i in lda_categories]
fig, ax = plt.subplots(figsize=(16,8))
ax.bar(lda_categories, lda_counts);
ax.set_xticks(lda_categories);
ax.set_xticklabels(labels);
ax.set_title('LDA topic counts');
ax.set_ylabel('Number of headlines');
plt.show()

LDA approach using gensim

Gensim offers more handy functions for further analysis, so the rest of our analysis focuses on gensim.

# LDA analysis (using gensim, which has more powerful features)
import gensim, logging, warnings
import gensim.corpora as corpora
import matplotlib.pyplot as plt
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
tokens = mark_stock_news['token'].tolist()
# Construct a dictionary and corpus for the tokens
d = corpora.Dictionary(tokens)
cp = [d.doc2bow(text) for text in tokens]
# Train an LDA model with 4 topics and print the top 5 words of each topic
lda_model = gensim.models.ldamodel.LdaModel(corpus=cp,
                                           id2word=d,
                                           num_topics=4)
print(lda_model.print_topics(num_words=5))
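
The choice of num_topics (4 here) is a judgement call. As an optional sanity check, gensim's CoherenceModel can score how interpretable the topics are; the snippet below is a minimal sketch (not part of the original workflow) that reuses the lda_model, tokens and dictionary d defined above.

# Optional: score the topics with a c_v coherence measure (higher generally means more interpretable topics)
from gensim.models import CoherenceModel
coherence_model = CoherenceModel(model=lda_model, texts=tokens, dictionary=d, coherence='c_v')
print('Coherence score (c_v): {:.4f}'.format(coherence_model.get_coherence()))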

# Find the Dominant topic and its percentage contribution in each document
def format_topics_sentences(ldamodel=lda_model, corpus=cp, texts=tokens):
    # Init output
    sent_topics_df = pd.DataFrame()
    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                temp = pd.DataFrame()
                temp['Dominant_Topic']=[int(topic_num)]
                temp['Topic_Perc_Contrib']=[round(prop_topic,4)]
                temp['Topic_Keywords']=[topic_keywords]
                sent_topics_df = pd.concat([sent_topics_df, temp], ignore_index=True)
            else:
                break
    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=cp, texts=tokens)
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(10)

# Display setting to show more characters in column
pd.options.display.max_colwidth = 100
sent_topics_sorteddf_mallet = pd.DataFrame()
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')
for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Topic_Perc_Contrib'], ascending=False).head(1)], 
                                            axis=0)
# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)
# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]
# Show
sent_topics_sorteddf_mallet.head(10)

Visualize the results with graphs

As before, we would like to check which topics are most popular among the news. To go one step further, we also investigate the importance of each keyword within the topics.
Last but not least, we can create an interactive graph. Each bubble represents a topic: the larger the bubble, the more news belongs to that topic, and the further apart two bubbles are, the more different the topics are. Blue bars represent the overall frequency of each word, while red bars show how often the term appears within the selected topic.
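
As a quick first look at topic popularity, we can simply tally the Dominant_Topic column; the snippet below is a minimal sketch based on the df_dominant_topic table built above.

# Count how many headlines fall under each dominant topic
topic_counts = df_dominant_topic['Dominant_Topic'].value_counts().sort_index()
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(topic_counts.index.astype(str), topic_counts.values)
ax.set_xlabel('Dominant topic')
ax.set_ylabel('Number of headlines')
ax.set_title('Headline counts by dominant topic (gensim LDA)')
plt.show()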

# Distribution of Document Word Counts by Dominant Topic
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
fig, axes = plt.subplots(2,2,figsize=(16,14), dpi=160, sharex=True, sharey=True)
for i, ax in enumerate(axes.flatten()):    
    df_dominant_topic_sub = df_dominant_topic.loc[df_dominant_topic.Dominant_Topic == i, :]
    doc_lens = [len(d) for d in df_dominant_topic_sub.Text]
    ax.hist(doc_lens, bins = 30, color=cols[i])
    ax.tick_params(axis='y', labelcolor=cols[i], color=cols[i])
    sns.kdeplot(doc_lens, color="black", shade=False, ax=ax.twinx())
    ax.set(xlim=(0, 30), xlabel='Document Word Count')
    ax.set_ylabel('Number of Documents', color=cols[i])
    ax.set_title('Topic: '+str(i), fontdict=dict(size=16, color=cols[i]))
fig.tight_layout()
fig.subplots_adjust(top=0.90)
plt.xticks(np.linspace(0,30,9))
fig.suptitle('Distribution of Document Word Counts by Dominant Topic', fontsize=22)
plt.show()

# Word Counts of Topic Keywords
topics = lda_model.show_topics(formatted=False)
data_flat = [w for w_list in tokens for w in w_list]
counter = Counter(data_flat)
out = []
for i, topic in topics:
    for word, weight in topic:
        out.append([word, i , weight, counter[word]])
df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count']) 

# Plot Word Count and Weights of Topic Keywords
fig, axes = plt.subplots(2, 2, figsize=(16,10), sharey=True, dpi=160)
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
for i, ax in enumerate(axes.flatten()):
    ax.bar(x='word', height="word_count", data=df.loc[df.topic_id==i, :], color=cols[i], width=0.5, alpha=0.3, label='Word Count')
    ax_twin = ax.twinx()
    ax_twin.bar(x='word', height="importance", data=df.loc[df.topic_id==i, :], color=cols[i], width=0.2, label='Weights')
    ax.set_ylabel('Word Count', color=cols[i])
    ax_twin.set_ylim(0,0.15); ax.set_ylim(0, 95000)
    ax.set_title('Topic: ' + str(i), color=cols[i], fontsize=16)
    ax.tick_params(axis='y', left=False)
    ax.set_xticklabels(df.loc[df.topic_id==i, 'word'], rotation=30, horizontalalignment= 'right')
    ax.legend(loc='upper left'); ax_twin.legend(loc='upper right')
fig.tight_layout(w_pad=2)    
fig.suptitle('Word Count and Importance of Topic Keywords', fontsize=22, y=1.05)    
plt.show()

# Create an interactive graph.
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, cp, dictionary=lda_model.id2word)
vis
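
To share the interactive view outside the notebook, the prepared visualization can also be written to a standalone HTML file with pyLDAvis.save_html, as sketched below (the output filename is just an example). Note that in newer pyLDAvis releases the gensim helper module is pyLDAvis.gensim_models rather than pyLDAvis.gensim.

# Optional: save the interactive visualization to a standalone HTML file
pyLDAvis.save_html(vis, 'lda_topics.html')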
