# LDA Topic Modeling with CORDIS Data
# Installing Gensim and PyLDAvis
!pip install -qq -U gensim
!pip install -qq pyLDAvis
# Import packages
import pandas as pd
import numpy as np
import spacy
# Import the dictionary builder
from gensim.corpora.dictionary import Dictionary
# we'll use the faster multicore version of LDA
from gensim.models import LdaMulticore
# Import pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
%matplotlib inline
## Open data and preprocess
# Load the CORDIS H2020 reports; the '.gz' suffix lets pandas infer the compression.
reports = pd.read_csv('https://github.com/SDS-AAU/SDS-master/raw/master/M2/data/cordis-h2020reports.gz')

# Instantiate the spaCy model used for preprocessing.
nlp = spacy.load('en_core_web_sm')

# Only these part-of-speech tags are kept as candidate topic words.
keep_pos = {'NOUN', 'PROPN', 'ADJ', 'ADV'}

# Preprocess the texts: one list of lowercased lemmas per summary.
# NER is disabled because only lemmas and POS tags are read below.
tokens = []
for summary in nlp.pipe(reports['summary'], disable=["ner"]):
    proj_tok = [token.lemma_.lower() for token in summary
                if token.pos_ in keep_pos
                and not token.is_stop
                and not token.is_punct]
    # Collect the result for this document; without this append `tokens`
    # stays empty and cannot be assigned as a column of `reports`.
    tokens.append(proj_tok)

# Put the tokens into our dataframe (one token list per row, in row order).
reports['tokens'] = tokens
## LDA Model
# Build the token <-> id mapping from the tokenized reports.
dictionary = Dictionary(reports['tokens'])

# Prune the vocabulary: drop low-frequency / high-frequency tokens and
# cap the vocabulary at 1000 words.
dictionary.filter_extremes(
    no_below=5,
    no_above=0.5,
    keep_n=1000,
)

# Convert every document into its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, reports['tokens']))

# Train the (multicore) LDA model.
lda_model = LdaMulticore(
    corpus,
    id2word=dictionary,
    num_topics=10,
    workers=4,
    passes=10,
)

# Prepare the pyLDAvis data for the trained model.
lda_display = gensimvis.prepare(lda_model, corpus, dictionary)

# Export the visualization as standalone HTML, e.g. to publish it on a
# website or embed it in a blog post.
pyLDAvis.save_html(lda_display, 'lda.html')
# Toy example: given (topic_number, weight) pairs, rank them by weight in
# descending order and read off the topic number that is ranked highest.
topic_weights = [(2, 0.121567), (9, 0.8610384)]
ranked_topics = sorted(topic_weights, key=lambda pair: pair[1], reverse=True)
print(ranked_topics)
# First element of the first pair is the top-ranked topic number.
print(ranked_topics[0][0])
[(9, 0.8610384), (2, 0.121567)]