# LDA Topic Modeling with CORDIS Data

# Installing Gensim and PyLDAvis
!pip install -qq -U gensim
!pip install -qq pyLDAvis
     |████████████████████████████████| 24.1 MB 1.6 MB/s 
     |████████████████████████████████| 1.7 MB 4.7 MB/s 
?25h  Installing build dependencies ... ?25l?25hdone
  Getting requirements to build wheel ... ?25l?25hdone
  Installing backend dependencies ... ?25l?25hdone
    Preparing wheel metadata ... ?25l?25hdone
  Building wheel for pyLDAvis (PEP 517) ... ?25l?25hdone
  Building wheel for sklearn (setup.py) ... ?25l?25hdone
# Import packages

import pandas as pd
import numpy as np
import spacy

# Import the dictionary builder
from gensim.corpora.dictionary import Dictionary

# we'll use the faster multicore version of LDA

from gensim.models import LdaMulticore

# Import pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
%matplotlib inline
pyLDAvis.enable_notebook()
/usr/local/lib/python3.7/dist-packages/past/types/oldstr.py:5: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated since Python 3.3,and in 3.9 it will stop working
  from collections import Iterable
/usr/local/lib/python3.7/dist-packages/past/builtins/misc.py:4: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated since Python 3.3,and in 3.9 it will stop working
  from collections import Mapping

# Open data and preprocess

# Load the CORDIS H2020 report summaries (a .gz-compressed CSV hosted on GitHub).
reports = pd.read_csv('https://github.com/SDS-AAU/SDS-master/raw/master/M2/data/cordis-h2020reports.gz')

# Load the small English spaCy pipeline used for preprocessing.
nlp = spacy.load('en_core_web_sm')

# Tokenise every summary. NER is switched off because only lemmas,
# part-of-speech tags and the stop-word / punctuation flags are read below.
# For each document keep the lower-cased lemmas of nouns, proper nouns,
# adjectives and adverbs that are neither stop words nor punctuation.
tokens = [
    [
        tok.lemma_.lower()
        for tok in doc
        if tok.pos_ in ('NOUN', 'PROPN', 'ADJ', 'ADV')
        and not tok.is_stop
        and not tok.is_punct
    ]
    for doc in nlp.pipe(reports['summary'], disable=["ner"])
]

# Store the token lists as a new dataframe column and inspect the first report.
reports['tokens'] = tokens
reports['tokens'][0]
['polyaniline',
 'historically',
 'promising',
 'conductive',
 'polymer',
 'cost',
 'performance',
 'perspective',
 'processing',
 'issue',
 'uptake',
 'seminal',
 'work',
 'polyaniline',
 'cea',
 'rescoll',
 'independent',
 'research',
 'company',
 'france',
 'chemistry',
 'material',
 'new',
 'electrically',
 'conductive',
 'polyaniline',
 'formulation',
 'trade',
 'paniplast',
 'â„¢',
 'paniplast',
 'polymer',
 'safe',
 'design',
 'reach',
 'compliant',
 'low',
 'cost',
 'easily',
 'material',
 'highly',
 'dispersion',
 'high',
 'electrical',
 'conductivity',
 'stability',
 'unique',
 'versatile',
 'nature',
 'paniplast',
 'technology',
 'new',
 'opportunity',
 'vast',
 'range',
 'product',
 'excellent',
 'electrical',
 'conductivity',
 'conductive',
 'resin',
 'conductive',
 'coating',
 'ultrathin',
 'film',
 'paniplast',
 'material',
 'great',
 'market',
 'potential',
 'innovative',
 'response',
 'market',
 'need',
 'commercial',
 'solution',
 'available',
 'today',
 'cost',
 'performance',
 'reason',
 'significant',
 'market',
 'interest',
 'end',
 'user',
 'prepared',
 'technology',
 'product',
 'rescoll',
 'stilz',
 'chimie',
 'producer',
 'paint',
 'ink',
 'formulation',
 'order',
 'pilot',
 'industrial',
 'scale',
 'production',
 'capability',
 'challenge',
 'technology',
 'industrial',
 'scale',
 'order',
 'performance',
 'capability',
 'cost',
 'paniplast',
 'conductive',
 'polymer',
 'market',
 'demand',
 'conductive',
 'material']

# LDA Model

# Build the vocabulary (token <-> integer id mapping) from the tokenised reports.
dictionary = Dictionary(reports['tokens'])

# Prune the vocabulary: drop tokens found in fewer than 5 documents or in more
# than 50% of all documents, then cap the vocabulary at 1000 tokens.
dictionary.filter_extremes(keep_n=1000, no_above=0.5, no_below=5)

# Convert every report into its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, reports['tokens']))

# Fit a 10-topic LDA model with the multicore trainer: 10 passes over the
# corpus, spread across 4 workers.
lda_model = LdaMulticore(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
    workers=4,
    passes=10,
)

# Visualization

# Prepare the pyLDAvis data structure from the trained model, the bag-of-words
# corpus and the dictionary it was built with.
lda_display = gensimvis.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=dictionary,
)
/usr/local/lib/python3.7/dist-packages/pyLDAvis/_prepare.py:247: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only
  by='saliency', ascending=False).head(R).drop('saliency', 1)
# Display the interactive visualization inline in the notebook
# lda_display holds the prepared pyLDAvis data; notebook output was enabled
# earlier with pyLDAvis.enable_notebook().
pyLDAvis.display(lda_display)
# Additionally write the visualization to 'lda.html' so it can be published on
# a website or embedded in a blog post.
pyLDAvis.save_html(lda_display, 'lda.html')
# How to pick the highest-ranked topic number: order the
# (topic number, probability) pairs by probability, highest first, and read
# the topic number of the first pair.
topic_probs = [(2, 0.121567), (9, 0.8610384)]
ranked = sorted(topic_probs, key=lambda pair: pair[1], reverse=True)

print(ranked)
print(ranked[0][0])
[(9, 0.8610384), (2, 0.121567)]
9