# LDA Topic Modeling with CORDIS Data

In [1]:
# Installing Gensim and PyLDAvis
!pip install -qq -U gensim
!pip install -qq pyLDAvis

[K     |████████████████████████████████| 24.1 MB 1.6 MB/s 
[K     |████████████████████████████████| 1.7 MB 4.7 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
  Building wheel for pyLDAvis (PEP 517) ... [?25l[?25hdone
  Building wheel for sklearn (setup.py) ... [?25l[?25hdone


In [2]:
# Import packages

import pandas as pd
import numpy as np
import spacy

# Import the dictionary builder
from gensim.corpora.dictionary import Dictionary

# we'll use the faster multicore version of LDA

from gensim.models import LdaMulticore

# Import pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
%matplotlib inline
pyLDAvis.enable_notebook()

  from collections import Iterable
  from collections import Mapping


## Open data and preprocess

In [3]:
# Source of the CORDIS H2020 project reports (gzip-compressed CSV on GitHub).
REPORTS_URL = 'https://github.com/SDS-AAU/SDS-master/raw/master/M2/data/cordis-h2020reports.gz'

# Download the file and parse it into a DataFrame.
reports = pd.read_csv(REPORTS_URL)

In [4]:
# Load spaCy's small English pipeline for preprocessing. The cells below rely
# on it for lemmas, part-of-speech tags and stop-word/punctuation flags.
# NOTE: the 'en_core_web_sm' model package must already be installed.
nlp = spacy.load('en_core_web_sm')


In [5]:
# Turn every report summary into a list of lower-cased lemmas.
# Only nouns, proper nouns, adjectives and adverbs are kept; stop words and
# punctuation are dropped. The NER component is disabled because no entity
# information is used here.
content_pos = ['NOUN', 'PROPN', 'ADJ', 'ADV']

tokens = [
    [
        tok.lemma_.lower()
        for tok in doc
        if tok.pos_ in content_pos and not (tok.is_stop or tok.is_punct)
    ]
    for doc in nlp.pipe(reports['summary'], disable=["ner"])
]

In [6]:
# Store the token lists in the dataframe, one list per report
# (`tokens` is in the same order as the rows of `reports`).
reports['tokens'] = tokens

In [7]:
# Inspect the token list of the first report (row label 0).
reports.loc[0, 'tokens']

['polyaniline',
 'historically',
 'promising',
 'conductive',
 'polymer',
 'cost',
 'performance',
 'perspective',
 'processing',
 'issue',
 'uptake',
 'seminal',
 'work',
 'polyaniline',
 'cea',
 'rescoll',
 'independent',
 'research',
 'company',
 'france',
 'chemistry',
 'material',
 'new',
 'electrically',
 'conductive',
 'polyaniline',
 'formulation',
 'trade',
 'paniplast',
 '™',
 'paniplast',
 'polymer',
 'safe',
 'design',
 'reach',
 'compliant',
 'low',
 'cost',
 'easily',
 'material',
 'highly',
 'dispersion',
 'high',
 'electrical',
 'conductivity',
 'stability',
 'unique',
 'versatile',
 'nature',
 'paniplast',
 'technology',
 'new',
 'opportunity',
 'vast',
 'range',
 'product',
 'excellent',
 'electrical',
 'conductivity',
 'conductive',
 'resin',
 'conductive',
 'coating',
 'ultrathin',
 'film',
 'paniplast',
 'material',
 'great',
 'market',
 'potential',
 'innovative',
 'response',
 'market',
 'need',
 'commercial',
 'solution',
 'available',
 'today',
 'cost',
 'perfo

## LDA Model

In [8]:
# Build the gensim vocabulary (token <-> integer id mapping) from the
# tokenised reports.
dictionary = Dictionary(documents=reports['tokens'])

In [9]:
# Prune the vocabulary in place:
#   no_below=5   -> drop tokens that appear in fewer than 5 reports
#   no_above=0.5 -> drop tokens that appear in more than 50% of the reports
#   keep_n=1000  -> of what remains, keep only the 1000 most frequent tokens
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=1000)

In [10]:
# Convert each report's token list into its bag-of-words representation
# (a list of (token_id, count) pairs) using the pruned dictionary.
corpus = list(map(dictionary.doc2bow, reports['tokens']))

In [11]:
# Train a 10-topic LDA model with the multicore implementation
# (4 worker processes, 10 passes over the corpus).
# LDA is initialised randomly, so a fixed `random_state` is passed to keep the
# topic numbering and weights comparable between runs.
# NOTE(review): with several workers, small run-to-run differences may still
# occur; use a single worker if exact reproducibility is required.
lda_model = LdaMulticore(
    corpus,
    id2word=dictionary,
    num_topics=10,
    workers=4,
    passes=10,
    random_state=42,
)

## Visualization

In [12]:
# Prepare the data structure pyLDAvis needs to draw the topic map,
# using the `gensimvis` alias imported at the top of the notebook.
lda_display = gensimvis.prepare(lda_model, corpus, dictionary)


  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [13]:
 # Show the prepared pyLDAvis visualization as this cell's output.
pyLDAvis.display(lda_display)

In [14]:
# Export the visualization to 'lda.html' (relative to the current working
# directory), e.g. to publish it on a website or embed it in a blog post.
pyLDAvis.save_html(lda_display, 'lda.html')

In [15]:
# Example: given the (topic_id, weight) pairs of one document, rank them by
# weight and read off the id of the highest-ranked topic.
topic_weights = [(2, 0.121567), (9, 0.8610384)]

# Sort by the weight (second element), largest first.
ranked = sorted(topic_weights, key=lambda pair: pair[1], reverse=True)

print(ranked)
print(ranked[0][0])

[(9, 0.8610384), (2, 0.121567)]
9
