Resume

 

[Image: John Glenn, 1962. Credit: NASA]

.pdf available here: NA


Dominique M. Awis

 


Latent Dirichlet Allocation code for R/R Studio

setwd("C:/Users/user/Desktop/NASA_Administrators")
## create data structures from text mining package, tm

library(tm)
library(topicmodels)
source <- DirSource("/Users/user/Desktop/NASA_Administrators/Admin_txts")
corpus <- Corpus(source, readerControl=list(reader=readPlain, language="eng"))

summary(corpus)

corpus <- tm_map(corpus, PlainTextDocument)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords, c(stopwords("english"),
                 "national aeronautics and space administration",
                 "nasa", "space"))
corpus <- tm_map(corpus, stemDocument, language = "english")
dtm <- DocumentTermMatrix(corpus)

dtm <- removeSparseTerms(dtm, .75)

best.model <- lapply(seq(2,20, by=1), function(k){LDA(dtm, k)})
best.model.logLik <- as.data.frame(as.matrix(lapply(best.model, logLik)))
best.model.logLik.df <- data.frame(topics=c(2:20),
LL=as.numeric(as.matrix(best.model.logLik)))
library(ggplot2)
ggplot(best.model.logLik.df, aes(x=topics, y=LL)) +
xlab("Number of topics") + ylab("Log likelihood of the model") +
geom_line() +
theme_bw()
best.model.logLik.df[which.max(best.model.logLik.df$LL),]

## 4-topic VEM fit with a fixed seed
dtm_LDA_1 <- LDA(dtm, k=4, method="VEM", control=list(seed=0))
get_terms(dtm_LDA_1, 50)

## refit with the same seed and k: results should reproduce exactly
dtm_LDA_2 <- LDA(dtm, k=4, method="VEM", control=list(seed=0))
get_terms(dtm_LDA_2, 2)
identical(dtm_LDA_1, dtm_LDA_2)
all.equal(dtm_LDA_1, dtm_LDA_2)
a1 <- posterior(dtm_LDA_1)
a2 <- posterior(dtm_LDA_2)
identical(a1, a2)

## a 6-topic fit for comparison
dtm_LDA_1 <- LDA(dtm, k=6, method="VEM", control=list(seed=0))
get_terms(dtm_LDA_1, 10)
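
The posterior objects hold the fitted distributions themselves; below is a minimal sketch of inspecting them, using the a1 and dtm_LDA_2 objects from above.

## per-document topic proportions (rows = documents, columns = topics)
head(a1$topics)
## hard assignment: the single most likely topic for each document
topics(dtm_LDA_2)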

harmonicMean <- function(logLikelihoods, precision=2000L) {
  library("Rmpfr")
  llMed <- median(logLikelihoods)
  as.double(llMed - log(mean(exp(-mpfr(logLikelihoods, prec = precision) + llMed))))
}

k = 15
burnin = 1000
iter = 1000
keep = 50

fitted <- LDA(dtm, k = k, method = "Gibbs", control = list(burnin = burnin,
              iter = iter, keep = keep))

logLiks <- fitted@logLiks[-c(1:(burnin/keep))]
harmonicMean(logLiks)
sequ <- seq(2, 100, 1)
fitted_many <- lapply(sequ, function(k) LDA(dtm, k = k, method = "Gibbs",
                      control = list(burnin = burnin, iter = iter, keep = keep)))
logLiks_many <- lapply(fitted_many, function(L) L@logLiks[-c(1:(burnin/keep))])
## harmonic mean of each fit's log-likelihoods
hm_many <- sapply(logLiks_many, function(h) harmonicMean(h))
plot(sequ, hm_many, type = "l")
sequ[which.max(hm_many)]
## http://stackoverflow.com/questions/7927367/r-text-file-and-text-mining-how-to-load-data
## http://stats.stackexchange.com/questions/25113/the-input-parameters-for-using-latent-dirichlet-allocation
## http://stackoverflow.com/questions/21355156/topic-models-cross-validation-with-loglikelihood-or-perplexity/21394092#21394092
## http://stackoverflow.com/questions/22623309/how-to-reproduce-exact-results-with-lda-function-in-rs-topicmodels-package

Word Cloud code for R/R Studio

## Text modeling
# Word Cloud

### Set working directory
setwd("C:/Users/spaceloss/Desktop")

## plain-text files live in a folder named "text" on the Desktop

library(tm)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)
source <- DirSource("/Users/spaceloss/Desktop/text")
corpus <- Corpus(source, readerControl=list(reader=readPlain, language="eng"))

summary(corpus)

corpus <- tm_map(corpus, PlainTextDocument)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, c(stopwords("english"), "musk"))

wordcloud(corpus, max.words = 1000, random.order = FALSE)
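
RColorBrewer is loaded above but never used; here is a small sketch of a colored cloud using a Brewer palette (the palette choice is arbitrary), assuming the same corpus.

## color the cloud with a Brewer palette
pal <- brewer.pal(8, "Dark2")
wordcloud(corpus, max.words = 1000, random.order = FALSE, colors = pal)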

Introductory R Code

###################################################################################

# INTRODUCTORY R CODE

########################################
## download both R and R Studio; work out of R Studio
## download csv file -or- convert/export spreadsheet to .csv file

## R Studio is pretty customizable if you prefer a larger code workspace or a larger
## console, want to swap quadrants of the workspace, or change colors in the source
# window for viewing ease, etc.:
## From MENU: -> TOOLS -> GLOBAL OPTIONS -> APPEARANCE
## at first, maybe try just copy/pasting variable names and then experimenting
## with parameters/options
## Quick-R and Stack Overflow are good resources

## Ctrl+Enter with the cursor on a line executes that line in the console
## Ctrl+Shift+C comments out source code (#)
##################################################################################
## LOADING DATA AND VARIOUS BASIC ANALYSIS

## set working directory

## From MENU: -> SESSION -> SET WORKING DIRECTORY -> TO SOURCE FILE LOCATION
## -- OR:
setwd("C:/Users/SPACELOSS/Dropbox/KR & dA")
## load data from .csv file (convert/export spreadsheet to .csv file)
## give data set (file) a name
## view variables

Data <- read.csv("data.csv", header=TRUE)  ## placeholder file name; use your .csv
attach(Data)
names(Data)

# returns variable names:

# [1] "Year"        "apollo_era"  "Space_Cong"
# [4] "Space_Budg"  "Percent_Chg"
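
Before analysis, a few quick sanity checks on the loaded data are worth running (assuming the Data object from above).

str(Data)      # variable types and a preview of each column
summary(Data)  # numeric summaries; also reveals NA counts
head(Data)     # first six rows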

#######################################################

## Dependent Variable Analysis

################################################
## nickname the variable: "Space_Budg" becomes "DV" for ease

DV <- Space_Budg

# to specify which dataset a variable came from, in case multiple sets are loaded
DV <- Data$Space_Budg

# to subset the data by Year, etc., and remove NA's
## (reconstructed line; adjust the condition as needed)
DV49 <- subset(DV, Year >= 1949)

## create a density plot, then show it in the workspace
d1 <- density(DV, na.rm=TRUE)
plot(d1)
#histogram & set breaks
h1 <- hist(DV, breaks=40)
#plot
plot(h1)
# add a fit of the normal distribution using the data mean (mu)
## and standard deviation (sigma); color = darkblue; line width = 2
## (reconstructed line; assumes the histogram is on the density scale)
d2 <- curve(dnorm(x, mean=mean(DV, na.rm=TRUE), sd=sd(DV, na.rm=TRUE)),
            col="darkblue", lwd=2, add=TRUE)

## simple bivariate regression
## x -> Space_Cong
## y -> ABSpace_BudgLag
reg1 <- lm(ABSpace_BudgLag ~ Space_Cong)
summary(reg1)
confint(reg1, level=0.95)
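
Base R's plot method for lm objects gives quick residual diagnostics; a sketch, assuming reg1 was fit as above.

## four standard diagnostic plots: residuals vs. fitted, Q-Q, scale-location, leverage
par(mfrow=c(2,2))
plot(reg1)
par(mfrow=c(1,1))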

# add control variables with + DV2 + DV3, etc. (names here are placeholders)
reg1a <- lm(ABSpace_BudgLag ~ Space_Cong + DV2 + DV3)
summary(reg1a)

###########

## install the 'stargazer' package for LaTeX output of publication-quality tables (only once)
install.packages("stargazer")

## load package every session
library(stargazer)

# should return LaTeX code
stargazer(reg1)

# Looks like:

# % Table created by stargazer v.5.2 by Marek Hlavac, Harvard University. E-mail: hlavac at fas.harvard.edu
# % Date and time: Sat, Aug 06, 2016 – 6:51:19 AM
# \begin{table}[!htbp] \centering
# \caption{}
# etc…
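
For a quick preview in the console instead of LaTeX, stargazer can also render plain text:

# same table as readable text in the console
stargazer(reg1, type = "text")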
####################################################
## plot customized regression with error margins, labels, etc.

# Plot of Regression 1 – reg1

par(mar=c(5, 5, 4, 5) + 0.1)
## grid of x values for the confidence band (reconstructed line)
newx <- seq(min(Space_Cong), max(Space_Cong), 0.01)
## predicted values with 95% confidence interval (reconstructed line)
a <- predict(reg1, newdata=data.frame(Space_Cong=newx), interval="confidence")
plot(Space_Cong, ABSpace_BudgLag, main="Spending Change: 1949-2013",
     axes=FALSE, xlab="Congressional Attention", ylab="", col='orangered')
mtext("Spending Change", side=2, line=4.0)
box()
axis(2, at=pretty(ABSpace_BudgLag), lab=paste0(pretty(ABSpace_BudgLag) * 100,
     " %"), las=1)
axis(1, at=pretty(Space_Cong), lab=paste0(pretty(Space_Cong) * 100, " %"), las=1)

## add regression line
abline(lm(ABSpace_BudgLag~Space_Cong), col="darkmagenta")

## add error margins of regression line
lines(newx, a[,2], lty=3, type="l", col="blue")
lines(newx, a[,3], lty=3, type="l", col="blue")
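
A legend helps distinguish the fitted line from the confidence band; a sketch, with illustrative labels.

## label the regression line and its confidence band (wording is an assumption)
legend("topleft", legend=c("OLS fit", "95% confidence band"),
       col=c("darkmagenta", "blue"), lty=c(1, 3), bty="n")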

###########################################################################
### SPACELOSS