LDA Topic Modeling with CORDIS Data
Contents
LDA Topic Modeling with CORDIS Data
# Installing Gensim and PyLDAvis
!pip install -qq -U gensim
!pip install -qq pyLDAvis
|████████████████████████████████| 24.1 MB 1.6 MB/s
|████████████████████████████████| 1.7 MB 4.7 MB/s
Installing build dependencies ... done
Getting requirements to build wheel ... done
Installing backend dependencies ... done
Preparing wheel metadata ... done
Building wheel for pyLDAvis (PEP 517) ... done
Building wheel for sklearn (setup.py) ... done
# Import packages
import pandas as pd
import numpy as np
import spacy
# Import the dictionary builder
from gensim.corpora.dictionary import Dictionary
# we'll use the faster multicore version of LDA
from gensim.models import LdaMulticore
# Import pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
%matplotlib inline
pyLDAvis.enable_notebook()
/usr/local/lib/python3.7/dist-packages/past/types/oldstr.py:5: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated since Python 3.3,and in 3.9 it will stop working
from collections import Iterable
/usr/local/lib/python3.7/dist-packages/past/builtins/misc.py:4: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated since Python 3.3,and in 3.9 it will stop working
from collections import Mapping
Open data and preprocess
# Load the CORDIS H2020 project reports from the course repository
# (the file is a .gz archive that pandas reads as CSV).
reports_url = 'https://github.com/SDS-AAU/SDS-master/raw/master/M2/data/cordis-h2020reports.gz'
reports = pd.read_csv(reports_url)

# Load the small English spaCy pipeline used for preprocessing the summaries.
spacy_model_name = 'en_core_web_sm'
nlp = spacy.load(spacy_model_name)
# Preprocess the texts: each report summary becomes a list of lower-cased lemmas.
# Only nouns, proper nouns, adjectives and adverbs are kept.
KEPT_POS = ('NOUN', 'PROPN', 'ADJ', 'ADV')


def _is_content_token(token):
    """Return True if the token has a kept POS tag and is neither a stop word nor punctuation."""
    return token.pos_ in KEPT_POS and not (token.is_stop or token.is_punct)


# The "ner" pipeline component is disabled because entity labels are never read here.
tokens = [
    [tok.lemma_.lower() for tok in doc if _is_content_token(tok)]
    for doc in nlp.pipe(reports['summary'], disable=["ner"])
]

# Put the tokens into our dataframe (one list of tokens per report).
reports['tokens'] = tokens

# Inspect the tokens of the first report.
reports.loc[0, 'tokens']
['polyaniline',
'historically',
'promising',
'conductive',
'polymer',
'cost',
'performance',
'perspective',
'processing',
'issue',
'uptake',
'seminal',
'work',
'polyaniline',
'cea',
'rescoll',
'independent',
'research',
'company',
'france',
'chemistry',
'material',
'new',
'electrically',
'conductive',
'polyaniline',
'formulation',
'trade',
'paniplast',
'â„¢',
'paniplast',
'polymer',
'safe',
'design',
'reach',
'compliant',
'low',
'cost',
'easily',
'material',
'highly',
'dispersion',
'high',
'electrical',
'conductivity',
'stability',
'unique',
'versatile',
'nature',
'paniplast',
'technology',
'new',
'opportunity',
'vast',
'range',
'product',
'excellent',
'electrical',
'conductivity',
'conductive',
'resin',
'conductive',
'coating',
'ultrathin',
'film',
'paniplast',
'material',
'great',
'market',
'potential',
'innovative',
'response',
'market',
'need',
'commercial',
'solution',
'available',
'today',
'cost',
'performance',
'reason',
'significant',
'market',
'interest',
'end',
'user',
'prepared',
'technology',
'product',
'rescoll',
'stilz',
'chimie',
'producer',
'paint',
'ink',
'formulation',
'order',
'pilot',
'industrial',
'scale',
'production',
'capability',
'challenge',
'technology',
'industrial',
'scale',
'order',
'performance',
'capability',
'cost',
'paniplast',
'conductive',
'polymer',
'market',
'demand',
'conductive',
'material']
LDA Model
# Build the vocabulary (token -> integer id) from the tokenised reports.
dictionary = Dictionary(reports['tokens'])

# Prune the vocabulary: drop tokens found in fewer than 5 documents or in more
# than 50% of all documents, then keep at most the 1000 most frequent tokens.
dictionary.filter_extremes(keep_n=1000, no_above=0.5, no_below=5)

# Convert every report into its bag-of-words representation using this dictionary.
corpus = list(map(dictionary.doc2bow, reports['tokens']))

# Train a 10-topic LDA model with the multicore implementation
# (4 worker processes, 10 passes over the corpus).
lda_model = LdaMulticore(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
    workers=4,
    passes=10,
)
Visualization
# Prepare the pyLDAvis data structure from the trained model, the bag-of-words
# corpus and the dictionary (using the `gensimvis` alias imported at the top).
lda_display = gensimvis.prepare(lda_model, corpus, dictionary)
/usr/local/lib/python3.7/dist-packages/pyLDAvis/_prepare.py:247: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only
by='saliency', ascending=False).head(R).drop('saliency', 1)
# Show the prepared LDA visualization in the notebook output.
pyLDAvis.display(lda_display)
# Export the same visualization to 'lda.html', e.g. to publish it on a website
# or embed it in a blog post.
pyLDAvis.save_html(lda_display, 'lda.html')
# How to get the topic number that is ranked highest for a document.
# Example input: (topic_id, probability) pairs for one document.
topic_probs = [(2, 0.121567), (9, 0.8610384)]

# Full ranking, most probable topic first (sorted once, reused for printing).
ranked_topics = sorted(topic_probs, key=lambda pair: pair[1], reverse=True)
print(ranked_topics)

# Only the winner is needed here, so take the maximum instead of sorting again;
# on ties `max` returns the first pair, matching the stable sort above.
top_topic = max(topic_probs, key=lambda pair: pair[1])[0]
print(top_topic)
[(9, 0.8610384), (2, 0.121567)]
9