# Latent Dirichlet Allocation code for R/R Studio

# Work from the project directory that holds the NASA Administrators texts.
# NOTE(review): this path has a drive letter while the DirSource() path below
# does not -- confirm both resolve to the same folder on the target machine.
setwd("C:/Users/user/Desktop/NASA_Administrators")
## create data structures from text mining package, tm

library(tm)
library(topicmodels)
# Every file in Admin_txts becomes one document, read as plain text.
source <- DirSource("/Users/user/Desktop/NASA_Administrators/Admin_txts")
corpus <- Corpus(source,
                 readerControl = list(reader = readPlain, language = "eng"))

# Quick sanity check of what was loaded.
summary(corpus)

# Normalise the documents step by step: coerce to plain text, lower-case,
# drop digits and punctuation, then collapse runs of whitespace.
corpus <- tm_map(corpus, PlainTextDocument)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
# Remove English stop words plus corpus-specific boilerplate terms. The agency
# name must be a single-line, single-spaced phrase: whitespace has already been
# collapsed above, so a phrase containing a line break could never match.
corpus <- tm_map(
  corpus, removeWords,
  c(stopwords("english"),
    "national aeronautics and space administration",
    "nasa", "space")
)
corpus <- tm_map(corpus, stemDocument, language = "english")
dtm <- DocumentTermMatrix(corpus)

# Trim the vocabulary using a maximal allowed sparsity of 0.75.
dtm <- removeSparseTerms(dtm, .75)

# Fit one LDA model (default settings) for each candidate number of topics
# from 2 to 20, then compare the models by log likelihood.
best.model <- lapply(seq(2, 20, by = 1), function(k) {
  LDA(dtm, k)
})
best.model.logLik <- as.data.frame(as.matrix(lapply(best.model, logLik)))
# One row per candidate topic count; `topics` must stay aligned with the
# seq(2, 20) used to fit the models above.
best.model.logLik.df <- data.frame(
  topics = c(2:20),
  LL = as.numeric(as.matrix(best.model.logLik))
)
library(ggplot2)
ggplot(best.model.logLik.df, aes(x = topics, y = LL)) +
  xlab("Number of topics") + ylab("Log likelihood of the model") +
  geom_line() +
  theme_bw()
# Row with the highest log likelihood, i.e. the preferred topic count.
best.model.logLik.df[which.max(best.model.logLik.df$LL), ]

# Inspect the top terms of a 4-topic and a 6-topic VEM fit. The 6-topic model
# gets its own name so it does not overwrite the 4-topic fit that the
# reproducibility check below relies on.
dtm_LDA_1 <- LDA(dtm, k = 4, method = "VEM", control = list(seed = 0))
get_terms(dtm_LDA_1, 50)
dtm_LDA_6 <- LDA(dtm, k = 6, method = "VEM", control = list(seed = 0))
get_terms(dtm_LDA_6, 10)

# Reproducibility check: refit with the same k and the same seed, then compare
# the two fitted models and their posterior distributions.
dtm_LDA_2 <- LDA(dtm, k = 4, method = "VEM", control = list(seed = 0))
get_terms(dtm_LDA_2, 2)
identical(dtm_LDA_1, dtm_LDA_2)
all.equal(dtm_LDA_1, dtm_LDA_2)
a1 <- posterior(dtm_LDA_1)
a2 <- posterior(dtm_LDA_2)
identical(a1, a2)

#' Harmonic-mean estimate of the log marginal likelihood
#'
#' Computes `-log(mean(exp(-logLikelihoods)))`, i.e. the log of the harmonic
#' mean of the likelihoods. The values are centred on their median before
#' exponentiating so the intermediate terms stay in a manageable range.
#'
#' @param logLikelihoods Numeric vector of log likelihoods.
#' @param precision Bits of precision passed to `Rmpfr::mpfr()`. Only used
#'   when the Rmpfr package is installed.
#' @return A double of length one.
harmonicMean <- function(logLikelihoods, precision = 2000L) {
  llMed <- median(logLikelihoods)
  if (requireNamespace("Rmpfr", quietly = TRUE)) {
    # Arbitrary-precision path, as in the original formulation.
    weights <- exp(-Rmpfr::mpfr(logLikelihoods, prec = precision) + llMed)
    # sum()/length() rather than mean(): Rmpfr is loaded but not attached, so
    # rely only on operators and primitives that dispatch on mpfr objects.
    return(as.double(llMed - log(sum(weights) / length(weights))))
  }
  # Base-R fallback: log-sum-exp, shifting by the largest exponent so exp()
  # cannot overflow in double precision.
  shifted <- llMed - logLikelihoods
  peak <- max(shifted)
  llMed - (peak + log(mean(exp(shifted - peak))))
}

# Settings for a single Gibbs-sampled LDA fit.
k <- 15
burnin <- 1000
iter <- 1000
keep <- 50

fitted <- LDA(dtm, k = k, method = "Gibbs",
              control = list(burnin = burnin, iter = iter, keep = keep))

# Drop the first burnin / keep stored log likelihoods (the entries recorded
# during burn-in) before estimating the harmonic mean.
logLiks <- fitted@logLiks[-c(1:(burnin / keep))]
harmonicMean(logLiks)
# Repeat the Gibbs fit for every topic count from 2 to 100 and select the
# count whose harmonic-mean log likelihood is highest.
# Reuses `burnin`, `iter` and `keep` from the single-model fit above.
sequ <- seq(2, 100, 1)
fitted_many <- lapply(sequ, function(k) {
  LDA(dtm, k = k, method = "Gibbs",
      control = list(burnin = burnin, iter = iter, keep = keep))
})
# Discard the burn-in log likelihoods of each model.
logLiks_many <- lapply(fitted_many,
                       function(L) L@logLiks[-c(1:(burnin / keep))])
# One harmonic-mean estimate per fitted model; this step was missing, leaving
# `hm_many` undefined when it was plotted below.
hm_many <- vapply(logLiks_many, harmonicMean, numeric(1))
plot(sequ, hm_many, type = "l")
sequ[which.max(hm_many)]
## http://stackoverflow.com/questions/7927367/r-text-file-and-text-mining-how-to-load-data
## http://stats.stackexchange.com/questions/25113/the-input-parameters-for-using-latent-dirichlet-allocation
## http://stackoverflow.com/questions/21355156/topic-models-cross-validation-with-loglikelihood-or-perplexity/21394092#21394092
## http://stackoverflow.com/questions/22623309/how-to-reproduce-exact-results-with-lda-function-in-rs-topicmodels-package

Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s