TRB Compendium Papers

setwd("/Users/subasishdas1/Copy/Rpubs/rpubs/tm_rpubs")
### bt_tw1 <- read.csv("TRB_AM.csv")
bt_tw1 <- read.csv("TRB_AM5.csv")
names(bt_tw1)
dim(bt_tw1)

table(bt_tw1$Year.the.Paper.was.Presented)

tweets <- bt_tw1  # note: the object keeps the name "tweets" throughout, though it holds TRB papers

# one "document" per year: all paper titles for a year pasted together; sort()
# keeps the Year labels aligned with tapply()'s factor-level ordering
all2 <- data.frame(Year = sort(unique(tweets$Year.the.Paper.was.Presented)),
                   Title = tapply(tweets$Paper.Title, tweets$Year.the.Paper.was.Presented, paste, collapse = ' '))
head(all2)
names(all2)




library(tm)
mydata.corpus <- Corpus(VectorSource(all2$Title))
# 'UTF-8-MAC' is an OS X-specific iconv encoding; on other platforms plain 'UTF-8' may be needed
mydata.corpus <- tm_map(mydata.corpus, content_transformer(function(x) iconv(x, to='UTF-8-MAC', sub='byte')), mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, content_transformer(tolower), mc.cores=1) 
mydata.corpus <- tm_map(mydata.corpus, removePunctuation, preserve_intra_word_dashes=TRUE, mc.cores=1) 
# the titles are English, so the English base stopword list is used, plus a
# hand-built list of corpus-specific terms
my_stopwords <- c(stopwords('english'),
  "the", "due", "are", "not", "for", "this", "and", "that", "there", "new",
  "near", "beyond", "time", "from", "been", "both", "than", "set", "has",
  "now", "until", "all", "use", "two", "ave", "blvd", "east", "between",
  "ccc", "end", "have", "avenue", "before", "i-us", "i-e", "i-i-", "ames",
  "belle", "gen", "okeefe", "one", "just", "mac", "being", "left", "right",
  "west", "when", "levels", "remaining", "based", "issues", "still", "off",
  "over", "only", "north", "past", "twin", "while", "i-w", "general",
  "harvey", "must", "more", "work", "read", "reached", "morrison", "mph",
  "three", "info", "canal", "camp", "la-", "approximately", "amp", "access",
  "approaching", "forest", "friday", "its", "affect", "after", "within",
  "what", "various", "under", "toward", "san", "other", "city", "into", "by",
  "is", "their", "he", "she", "research", "through", "below", "with", "an",
  "nowadays", "present", "important", "significant", "then", "using",
  "having", "via", "vermont", "some", "rap", "how", "can", "task", "type",
  "next", "limit", "areas", "real-", "small", "method", "case", "california",
  "canada", "used", "effect", "variable", "state", "factors", "zone",
  "analysis", "test", "methods")
mydata.corpus <- tm_map(mydata.corpus, removeWords, my_stopwords, mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, removeNumbers, mc.cores=1) 



mydata.dtm <- TermDocumentMatrix(mydata.corpus) 
dim(mydata.dtm)
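# Optional sanity check (a sketch): total count of each term across the year
# documents, assuming the mydata.dtm built above; 'term_freq' is an
# illustrative name introduced here.
term_freq <- sort(rowSums(as.matrix(mydata.dtm)), decreasing = TRUE)
head(term_freq, 20)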



DTM <- DocumentTermMatrix(mydata.corpus)



library(wordcloud)
term.matrix <- TermDocumentMatrix(mydata.corpus)
term.matrix <- as.matrix(term.matrix)
head(term.matrix)
colnames(term.matrix) <- c("TRB09-10", "TRB11-12","TRB13-14")
comparison.cloud(term.matrix,max.words=300, random.order=FALSE)


findFreqTerms(mydata.dtm, lowfreq=500)
findAssocs(mydata.dtm, 'traffic', 0.99)
findAssocs(mydata.dtm, 'analysis', 0.99)
findAssocs(mydata.dtm, 'model', 0.98)
findAssocs(mydata.dtm, 'pavement', 0.96)
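# Optional sketch: collect one keyword's associations into a data frame for
# export; 'assoc' and 'assoc_df' are illustrative names. Depending on the tm
# version, findAssocs() returns a named vector or a one-element list.
assoc <- findAssocs(mydata.dtm, 'traffic', 0.99)
if (is.list(assoc)) assoc <- assoc[[1]]
assoc_df <- data.frame(term = names(assoc), cor = as.numeric(assoc))
head(assoc_df)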

AUTHOR ORGANIZATIONS

tweets <- bt_tw1
names(tweets)
# one "document" per reviewing-committee code
all2 <- data.frame(Comm_CODE = sort(unique(tweets$Reviewing.Committee.s.Code)),
                   Title = tapply(tweets$Paper.Title, tweets$Reviewing.Committee.s.Code, paste, collapse = ' '))
head(all2)
names(all2)
unique(tweets$Reviewing.Committee.s.Code)
require(tm)
mydata.corpus <- Corpus(VectorSource(all2$Title))
mydata.corpus <- tm_map(mydata.corpus, content_transformer(function(x) iconv(x, to='UTF-8-MAC', sub='byte')), mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, content_transformer(tolower), mc.cores=1) 
mydata.corpus <- tm_map(mydata.corpus, removePunctuation, preserve_intra_word_dashes=TRUE, mc.cores=1) 
my_stopwords <- c(stopwords('english'),
  "the", "due", "are", "not", "for", "this", "and", "that", "there", "new",
  "near", "beyond", "time", "from", "been", "both", "than", "has", "now",
  "until", "all", "use", "two", "ave", "blvd", "east", "between", "ccc",
  "end", "have", "avenue", "before", "i-us", "i-e", "i-i-", "ames", "belle",
  "gen", "okeefe", "one", "just", "mac", "being", "left", "right", "west",
  "when", "levels", "remaining", "based", "issues", "still", "off", "over",
  "only", "north", "past", "twin", "while", "i-w", "general", "harvey",
  "must", "more", "work", "read", "reached", "morrison", "mph", "three",
  "info", "canal", "camp", "la-", "approximately", "amp", "access",
  "approaching", "forest", "friday", "its", "affect", "after", "within",
  "what", "various", "under", "toward", "san", "other", "city", "into", "by",
  "is", "their", "he", "she", "research", "through", "below", "with", "an",
  "nowadays", "present", "important", "significant", "then", "using",
  "having", "via", "vermont", "some", "rap", "how", "can",
  # organization, institution, and place-name terms
  "inc", "transportation", "advanced", "applied", "asphalt", "associates",
  "association", "authority", "center", "central", "cities", "college",
  "commission", "company", "construction", "consultant", "consultants",
  "consulting", "corporation", "council", "county", "department",
  "development", "engineering", "group", "highway", "icf", "imperial",
  "innovation", "institute", "international", "kth", "laboratory", "llc",
  "los", "ltd", "metropolitan", "ministry", "national", "old", "park",
  "parsons", "planning", "polytechnic", "polytechnique", "resource", "road",
  "royal", "safety", "santa", "science", "state", "systematics", "systems",
  "tech", "technical", "technological", "technology", "toronto", "transit",
  "transport", "united", "universidad", "universitat", "university",
  "america", "china", "japan", "universit", "france", "spain", "italy",
  "australia", "american", "swiss", "services", "auckland", "urban",
  "purdue", "portugal", "netherlands", "kingdom", "germany", "british",
  "tongji", "southeast", "poland", "south", "canada", "sweden", "mcgill",
  "stantec", "indian", "iran", "northwestern", "hill", "taiwan", "delft",
  "jiaotong", "french", "trinity", "western", "korea", "singapore", "jos",
  "federico", "federal", "school", "tchnico", "utrecht")
mydata.corpus <- tm_map(mydata.corpus, removeWords, my_stopwords, mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, removeNumbers, mc.cores=1) 

mydata.dtm <- TermDocumentMatrix(mydata.corpus) 
mydata.dtm
dim(mydata.dtm)

DTM <- DocumentTermMatrix(mydata.corpus)
findFreqTerms(mydata.dtm, lowfreq=100)
library(wordcloud)
term.matrix <- TermDocumentMatrix(mydata.corpus)
term.matrix <- as.matrix(term.matrix)
head(term.matrix)
colnames(term.matrix) <- c("TRB09-10", "TRB11-12","TRB13-14")
comparison.cloud(term.matrix,max.words=100, random.order=FALSE)

REVIEW COMMITTEE NAME

tweets <- read.csv("TRB_AM_COMM.csv")
names(tweets)
all2 <- data.frame(Year = sort(unique(tweets$Year.the.Paper.was.Presented)),
                   Comm = tapply(tweets$Reviewing.Committee.s.Name, tweets$Year.the.Paper.was.Presented, paste, collapse = ' '))
head(all2)
names(all2)

require(tm)
mydata.corpus <- Corpus(VectorSource(all2$Comm))
mydata.corpus <- tm_map(mydata.corpus, content_transformer(function(x) iconv(x, to='UTF-8-MAC', sub='byte')), mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, content_transformer(tolower), mc.cores=1) 
mydata.corpus <- tm_map(mydata.corpus, removePunctuation, preserve_intra_word_dashes=TRUE, mc.cores=1) 
my_stopwords <- c(stopwords('english'),
  "the", "due", "are", "not", "for", "this", "and", "papers", "review",
  "subcommittee", "that", "there", "new", "near", "beyond", "time", "from",
  "been", "both", "than", "has", "now", "until", "all", "use", "two", "ave",
  "blvd", "east", "between", "ccc", "end", "have", "avenue", "before",
  "i-us", "i-e", "i-i-", "ames", "belle", "gen", "okeefe", "one", "just",
  "mac", "being", "left", "right", "west", "when", "levels", "remaining",
  "based", "issues", "still", "off", "over", "only", "north", "past",
  "twin", "while", "i-w", "general", "harvey", "must", "more", "work",
  "read", "reached", "morrison", "mph", "three", "info", "canal", "camp",
  "la-", "approximately", "amp", "access", "approaching", "forest",
  "friday", "its", "affect", "after", "within", "what", "various", "under",
  "toward", "san", "other", "city", "into", "by", "is", "their", "he",
  "she", "research", "through", "below", "with", "an", "nowadays",
  "present", "important", "significant", "then", "using", "having", "via",
  "vermont", "some", "rap", "how", "can",
  # organization and institution terms
  "inc", "transportation", "advanced", "applied", "asphalt", "associates",
  "association", "authority", "center", "central", "cities", "college",
  "commission", "company", "construction", "consultant", "consultants",
  "consulting", "corporation", "council", "county", "department",
  "development", "engineering", "group", "highway", "icf", "imperial",
  "innovation", "institute", "international", "kth", "laboratory", "llc",
  "los", "ltd", "metropolitan", "ministry", "national", "old", "park",
  "parsons", "planning", "polytechnic", "polytechnique", "resource", "road",
  "royal", "safety", "santa", "science", "state", "systematics", "systems",
  "tech", "technical", "technological", "technology", "toronto", "transit",
  "transport", "united", "universidad", "universitat", "university",
  "ahd", "paper", "ahn", "special", "call", "activities", "cfp", "toledo",
  "geroliminis")
mydata.corpus <- tm_map(mydata.corpus, removeWords, my_stopwords, mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, removeNumbers, mc.cores=1) 

mydata.dtm <- TermDocumentMatrix(mydata.corpus) 
mydata.dtm
dim(mydata.dtm)


ddtm <- as.matrix(mydata.dtm)
write.csv(ddtm, "comm_name.csv")

DTM <- DocumentTermMatrix(mydata.corpus)
findFreqTerms(mydata.dtm, lowfreq=100)
library(wordcloud)
term.matrix <- TermDocumentMatrix(mydata.corpus)
term.matrix <- as.matrix(term.matrix)
head(term.matrix)
colnames(term.matrix) <- c("TRB09-10", "TRB11-12","TRB13-14")
comparison.cloud(term.matrix,max.words=60, random.order=FALSE)

COMMITTEE CODE

names(tweets)
all2 <- data.frame(Year = sort(unique(tweets$Year.the.Paper.was.Presented)),
                   Code = tapply(tweets$Reviewing.Committee.s.Code, tweets$Year.the.Paper.was.Presented, paste, collapse = ' '))
head(all2)
names(all2)


require(tm)
mydata.corpus <- Corpus(VectorSource(all2$Code))
mydata.corpus <- tm_map(mydata.corpus, content_transformer(function(x) iconv(x, to='UTF-8-MAC', sub='byte')), mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, content_transformer(tolower), mc.cores=1) 
mydata.corpus <- tm_map(mydata.corpus, removePunctuation, preserve_intra_word_dashes=TRUE, mc.cores=1) 
my_stopwords <- c(stopwords('english'),
  "the", "due", "are", "not", "for", "this", "and", "that", "there", "new",
  "near", "beyond", "time", "from", "been", "both", "than", "has", "now",
  "until", "all", "use", "two", "ave", "blvd", "east", "between", "ccc",
  "end", "have", "avenue", "before", "i-us", "i-e", "i-i-", "ames", "belle",
  "gen", "okeefe", "one", "just", "mac", "being", "left", "right", "west",
  "when", "levels", "remaining", "based", "issues", "still", "off", "over",
  "only", "north", "past", "twin", "while", "i-w", "general", "harvey",
  "must", "more", "work", "read", "reached", "morrison", "mph", "three",
  "info", "canal", "camp", "la-", "approximately", "amp", "access",
  "approaching", "forest", "friday", "its", "affect", "after", "within",
  "what", "various", "under", "toward", "san", "other", "city", "into",
  "by", "is", "their", "he", "she", "research", "through", "below", "with",
  "an", "nowadays", "present", "important", "significant", "then", "using",
  "having", "via", "vermont", "some", "rap", "how", "can",
  # organization and institution terms
  "inc", "transportation", "advanced", "applied", "asphalt", "associates",
  "association", "authority", "center", "central", "cities", "college",
  "commission", "company", "construction", "consultant", "consultants",
  "consulting", "corporation", "council", "county", "department",
  "development", "engineering", "group", "highway", "icf", "imperial",
  "innovation", "institute", "international", "kth", "laboratory", "llc",
  "los", "ltd", "metropolitan", "ministry", "national", "old", "park",
  "parsons", "planning", "polytechnic", "polytechnique", "resource", "road",
  "royal", "safety", "santa", "science", "state", "systematics", "systems",
  "tech", "technical", "technological", "technology", "toronto", "transit",
  "transport", "united", "universidad", "universitat", "university")
mydata.corpus <- tm_map(mydata.corpus, removeWords, my_stopwords, mc.cores=1)


mydata.dtm <- TermDocumentMatrix(mydata.corpus) 
mydata.dtm
dim(mydata.dtm)

ddtm <- as.matrix(mydata.dtm)
write.csv(ddtm, "comm_name.csv")


DTM <- DocumentTermMatrix(mydata.corpus)
findFreqTerms(mydata.dtm, lowfreq=5)
library(wordcloud)
term.matrix <- TermDocumentMatrix(mydata.corpus)
term.matrix <- as.matrix(term.matrix)
head(term.matrix)
colnames(term.matrix) <- c("TRB09-10", "TRB11-12","TRB13-14")
comparison.cloud(term.matrix,max.words=100, random.order=FALSE)

TOPIC MODELING

#### topic modeling on the paper titles
require(tm)
mydata.corpus <- Corpus(VectorSource(tweets$Paper.Title))
mydata.corpus <- tm_map(mydata.corpus, content_transformer(function(x) iconv(x, to='UTF-8-MAC', sub='byte')), mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, content_transformer(tolower), mc.cores=1) 
mydata.corpus <- tm_map(mydata.corpus, removePunctuation, preserve_intra_word_dashes=TRUE, mc.cores=1) 
my_stopwords <- c(stopwords('english'),
  "the", "due", "are", "not", "for", "this", "and", "that", "there", "new",
  "near", "beyond", "time", "from", "been", "both", "than", "has", "now",
  "until", "all", "use", "two", "ave", "blvd", "east", "between", "ccc",
  "end", "have", "avenue", "before", "i-us", "i-e", "i-i-", "ames", "belle",
  "gen", "okeefe", "one", "just", "mac", "being", "left", "right", "west",
  "when", "levels", "remaining", "based", "issues", "still", "off", "over",
  "only", "north", "past", "twin", "while", "i-w", "general", "harvey",
  "must", "more", "work", "read", "reached", "morrison", "mph", "three",
  "info", "canal", "camp", "la-", "approximately", "amp", "access",
  "approaching", "forest", "friday", "its", "affect", "after", "within",
  "what", "various", "under", "toward", "san", "other", "city", "into",
  "by", "is", "their", "he", "she", "research", "through", "below", "with",
  "an", "nowadays", "present", "important", "significant", "then", "using",
  "having", "via", "vermont", "some", "rap", "how", "can",
  # organization and institution terms
  "inc", "transportation", "advanced", "applied", "asphalt", "associates",
  "association", "authority", "center", "central", "cities", "college",
  "commission", "company", "construction", "consultant", "consultants",
  "consulting", "corporation", "council", "county", "department",
  "development", "engineering", "group", "highway", "icf", "imperial",
  "innovation", "institute", "international", "kth", "laboratory", "llc",
  "los", "ltd", "metropolitan", "ministry", "national", "old", "park",
  "parsons", "planning", "polytechnic", "polytechnique", "resource", "road",
  "royal", "safety", "santa", "science", "state", "systematics", "systems",
  "tech", "technical", "technological", "technology", "toronto", "transit",
  "transport", "united", "universidad", "universitat", "university")
mydata.corpus <- tm_map(mydata.corpus, removeWords, my_stopwords, mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, removeNumbers, mc.cores=1) 


# build a term-document matrix
mydata.dtm <- TermDocumentMatrix(mydata.corpus)
mydata.dtm
dim(mydata.dtm)


library(topicmodels)
dtm <- as.DocumentTermMatrix(mydata.dtm)
lda <- LDA(dtm, k = 6) # fit a 6-topic model
term <- terms(lda, 4)  # top 4 terms of every topic
term

term <- apply(term, MARGIN = 2, paste, collapse = ", ") # one comma-separated label per topic

# dominant topic for each document (paper title)
topic <- topics(lda, 1)
topics <- data.frame(Year=tweets$Year.the.Paper.was.Presented, topic)
library(ggplot2)
qplot(Year, ..count.., data=topics, geom="density",
      fill=term[topic], position="stack")+theme_bw()+
  theme(panel.background = element_blank()) +
theme(axis.ticks.x = element_blank())+ theme(axis.text.y = element_text(size = 12),
                                             axis.text.x = element_text(size = 12))
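# Optional sketch: beyond the single dominant topic plotted above, topicmodels
# can report the full document-by-topic probability matrix via posterior().
post <- posterior(lda)       # list with $terms and $topics
round(head(post$topics), 3)  # topic probabilities for the first documents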

TRB Compendium ABSTRACT

setwd("/Users/subasishdas1/Copy/Rpubs/rpubs/tm_rpubs")
bt_tw1 <- read.csv("TRB_AM.csv")
head(bt_tw1)
names(bt_tw1)
dim(bt_tw1)

tweets <- subset(bt_tw1, Year.the.Paper.was.Presented > 2005)
table(tweets$Year.the.Paper.was.Presented)
dim(tweets)


tweets2 <- tweets[sample(nrow(tweets), 3000), ]  # random sample of 3,000 papers for the corpus
dim(tweets2)
table(tweets2$Year.the.Paper.was.Presented)
## save the full post-2005 set (roughly 10,000 papers)
write.csv(tweets, "10000_pap.csv")

names(tweets2)

all2 <- data.frame(Year = sort(unique(tweets2$Year.the.Paper.was.Presented)),
                   Abstract = tapply(tweets2$Paper.Abstract, tweets2$Year.the.Paper.was.Presented, paste, collapse = ' '))
head(all2)
names(all2)
dim(all2)


require(tm)
mydata.corpus <- Corpus(VectorSource(all2$Abstract))
mydata.corpus <- tm_map(mydata.corpus, content_transformer(function(x) iconv(x, to='UTF-8-MAC', sub='byte')), mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, content_transformer(tolower), mc.cores=1) 
mydata.corpus <- tm_map(mydata.corpus, removePunctuation, preserve_intra_word_dashes=TRUE, mc.cores=1) 
my_stopwords <- c(stopwords('english'),
  "the", "due", "are", "not", "for", "this", "and", "that", "there", "new",
  "near", "beyond", "time", "from", "been", "both", "than", "has", "now",
  "until", "all", "use", "two", "ave", "blvd", "east", "between", "ccc",
  "end", "have", "avenue", "before", "i-us", "i-e", "i-i-", "ames", "belle",
  "gen", "okeefe", "one", "just", "mac", "being", "left", "right", "west",
  "when", "levels", "remaining", "based", "issues", "still", "off", "over",
  "only", "north", "past", "twin", "they", "alf", "found", "while", "i-w",
  "general", "harvey", "must", "more", "work", "read", "reached",
  "morrison", "mph", "three", "info", "canal", "camp", "la-",
  "approximately", "amp", "access", "approaching", "forest", "friday",
  "its", "affect", "after", "within", "what", "various", "under", "toward",
  "san", "other", "city", "into", "by", "is", "their", "he", "she",
  "research", "through", "below", "with", "an", "nowadays", "present",
  "important", "significant", "then", "using", "having", "via", "vermont",
  "some", "rap", "how", "can")
mydata.corpus <- tm_map(mydata.corpus, removeWords, my_stopwords, mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, removeNumbers, mc.cores=1) 

mydata.dtm <- TermDocumentMatrix(mydata.corpus) 
mydata.dtm
dim(mydata.dtm)

DTM <- DocumentTermMatrix(mydata.corpus)

findFreqTerms(mydata.dtm, lowfreq=2000)
findAssocs(mydata.dtm, 'traffic', 0.98)
findAssocs(mydata.dtm, 'analysis', 0.98)
findAssocs(mydata.dtm, 'model', 0.99)
findAssocs(mydata.dtm, 'evaluation', 0.98)
findAssocs(mydata.dtm, 'pavement', 0.98)

mydata.dtm2 <- removeSparseTerms(mydata.dtm, sparse=0.14)
mydata.dtm2
dim(mydata.dtm2)
inspect(mydata.dtm2[1:7,1:7])
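# Note (a sketch): removeSparseTerms() keeps only terms whose share of
# zero-count documents is below 'sparse'. Assuming the seven year documents
# used here, sparse = 0.14 retains only terms that appear in every document:
ceiling((1 - 0.14) * 7)  # minimum number of documents a retained term must occupy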

library(slam)

TDM.dense <- as.matrix(mydata.dtm2)
TDM.dense
object.size(mydata.dtm2)

write.csv(TDM.dense, "Abs_2000_sparse.csv")

#### PLOT 1
tdm_BT <- read.csv("Abs_2000_sparse_1.csv")
head(tdm_BT)
dim(tdm_BT)

tdm_BT_1 <- tdm_BT[c(1, 9)]  # Term and Count columns
head(tdm_BT_1)
library(ggplot2)
p <- ggplot(tdm_BT_1, aes(x = reorder(Term, -Count), y = Count))
p <- p + geom_bar(stat = "identity")+theme_bw()
p <- p + theme(axis.text.x=element_text(angle=65, hjust=1))
p + labs(title = "Most Frequent Terms")+xlab("Terms")+ 
theme(axis.text.x = element_text(size = 12), axis.text.y = element_text(size = 12))



###PLOT 2
library(reshape2)
tdm_BTT <- read.csv("Abs_2000_sparse_2.csv")
tdm_BT_2 <- melt(tdm_BTT[1:8])  # long format: Term, variable (meeting), value (count)
head(tdm_BT_2)
ggplot(tdm_BT_2, aes(x = variable, y = Term, fill = value)) +
geom_tile(colour = "white") +
scale_fill_gradient(high="#FF0000" , low="#FFFFFF")+
ylab("")+ xlab("TRB Annual Meeting")+
theme(panel.background = element_blank()) +
theme(axis.ticks.x = element_blank())+ theme(axis.text.y = element_text(size = 12))

### PLOT 3

library(wordcloud)
term.matrix <- TermDocumentMatrix(mydata.corpus)
term.matrix <- as.matrix(term.matrix)
head(term.matrix)
colnames(term.matrix) <- c("TRB2008","TRB2009", "TRB2010","TRB2011","TRB2012",
"TRB2013","TRB2014")
comparison.cloud(term.matrix,max.words=200, random.order=FALSE)
############# wordcloud
mydata.dtm2 <- removeSparseTerms(mydata.dtm, sparse=0.15)
mydata.dtm2
dim(mydata.dtm2)
inspect(mydata.dtm2[1:7,1:7])

library(slam)

TDM.dense <- as.matrix(mydata.dtm2)
TDM.dense
object.size(mydata.dtm2)
object.size(TDM.dense)

inspect(DocumentTermMatrix(mydata.corpus,
list(dictionary = c("traffic", "analysis", "model","evaluation", "pavement"))))

##################################################

write.csv(TDM.dense, "sparse_1.csv")

#### PLOT 1
tdm_BT <- read.csv("sparse_3.csv")
head(tdm_BT)
dim(tdm_BT)

tdm_BT_1 <- tdm_BT[c(1, 9)]  # Term and Count columns
head(tdm_BT_1)
library(ggplot2)
p <- ggplot(tdm_BT_1, aes(x = reorder(Term, -Count), y = Count))
p <- p + geom_bar(stat = "identity")+theme_bw()
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p + labs(title = "Most Frequent Terms")+xlab("Terms")+ 
theme(axis.text.x = element_text(size = 12), axis.text.y = element_text(size = 12))



###PLOT 2
library(reshape2)
tdm_BTT <- read.csv("sparse_3a.csv")
tdm_BT_2 <- melt(tdm_BTT[1:8])  # long format: Term, variable (meeting), value (count)
head(tdm_BT_2)
ggplot(tdm_BT_2, aes(x = variable, y = Term, fill = value)) +
geom_tile(colour = "white") +
scale_fill_gradient(high="#FF0000" , low="#FFFFFF")+
ylab("")+ xlab("TRB Annual Meeting")+
theme(panel.background = element_blank()) +
theme(axis.ticks.x = element_blank())+ theme(axis.text.y = element_text(size = 12))


####PLOT 3

tdm_BT_3 <- tdm_BT[1:8]
head(tdm_BT_3)
rownames(tdm_BT_3) <- tdm_BT_3$Term
tdm_BT_4 <- tdm_BT_3[2:8]
head(tdm_BT_4)
mydata.df.scale <- scale(tdm_BT_4)
d <- dist(mydata.df.scale, method = "euclidean") # distance matrix
fit <- hclust(d, method="ward.D")
plot(fit) # display the dendrogram
rect.hclust(fit, k = 10)
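# Optional sketch: cutree() returns each term's cluster id at k = 10, matching
# the rectangles drawn above; 'groups' is an illustrative name.
groups <- cutree(fit, k = 10)
table(groups)  # cluster sizes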
require(tm)
mydata.corpus <- Corpus(VectorSource(tweets2$Paper.Abstract))
mydata.corpus <- tm_map(mydata.corpus, content_transformer(function(x) iconv(x, to='UTF-8-MAC', sub='byte')), mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, content_transformer(tolower), mc.cores=1) 
mydata.corpus <- tm_map(mydata.corpus, removePunctuation, preserve_intra_word_dashes=TRUE, mc.cores=1) 
my_stopwords <- c(stopwords('english'),
  "the", "due", "are", "not", "for", "this", "and", "that", "there", "new",
  "near", "beyond", "time", "from", "been", "both", "than", "has", "now",
  "until", "all", "use", "two", "ave", "blvd", "east", "between", "ccc",
  "end", "have", "avenue", "before", "i-us", "i-e", "i-i-", "ames", "belle",
  "gen", "okeefe", "one", "just", "mac", "being", "left", "right", "west",
  "when", "levels", "remaining", "based", "issues", "still", "off", "over",
  "only", "north", "past", "twin", "while", "i-w", "general", "harvey",
  "must", "more", "work", "read", "reached", "morrison", "mph", "three",
  "info", "canal", "camp", "la-", "approximately", "amp", "access",
  "approaching", "forest", "friday", "its", "affect", "after", "within",
  "what", "various", "under", "toward", "san", "other", "city", "into",
  "by", "is", "their", "he", "she", "research", "through", "below", "with",
  "an", "nowadays", "present", "important", "significant", "then", "using",
  "having", "via", "vermont", "some", "rap", "how", "can", "such",
  # organization and institution terms
  "inc", "transportation", "advanced", "applied", "asphalt", "associates",
  "association", "authority", "center", "central", "cities", "college",
  "commission", "company", "construction", "consultant", "consultants",
  "consulting", "corporation", "council", "county", "department",
  "development", "engineering", "group", "highway", "icf", "imperial",
  "innovation", "institute", "international", "kth", "laboratory", "llc",
  "los", "ltd", "metropolitan", "ministry", "national", "old", "park",
  "parsons", "planning", "polytechnic", "polytechnique", "resource", "road",
  "royal", "safety", "santa", "science", "state", "systematics", "systems",
  "tech", "technical", "technological", "technology", "toronto", "transit",
  "transport", "united", "universidad", "universitat", "university",
  "different", "study", "results", "were", "which", "these", "used",
  "paper", "test", "bus", "proposed")
mydata.corpus <- tm_map(mydata.corpus, removeWords, my_stopwords, mc.cores=1)
mydata.corpus <- tm_map(mydata.corpus, removeNumbers, mc.cores=1) 


# build a term-document matrix
mydata.dtm <- TermDocumentMatrix(mydata.corpus)
mydata.dtm
dim(mydata.dtm)


library(topicmodels)
dtm <- as.DocumentTermMatrix(mydata.dtm)
lda <- LDA(dtm, k = 8) # fit an 8-topic model
term <- terms(lda, 6)  # top 6 terms of every topic
term

term <- apply(term, MARGIN = 2, paste, collapse = ", ") # one comma-separated label per topic

# dominant topic for each document (paper abstract)
topic <- topics(lda, 1)
# tweets2, not tweets: the corpus was built from the 3,000-abstract sample
topics <- data.frame(Year = tweets2$Year.the.Paper.was.Presented, topic)
library(ggplot2)
qplot(Year, ..count.., data=topics, geom="density",
      fill=term[topic], position="stack")+theme_bw()+
  theme(panel.background = element_blank()) +
theme(axis.ticks.x = element_blank())+ theme(axis.text.y = element_text(size = 12),
                                             axis.text.x = element_text(size = 12))
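# Optional check (sketch): a plain cross-tabulation of dominant topic by year,
# complementing the stacked-density plot above.
table(topics$Year, topics$topic)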
############# wordcloud
library(wordcloud)
tdm <- TermDocumentMatrix(mydata.corpus)
m <- as.matrix(tdm)
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
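set.seed(1234)  # optional: the wordcloud layout is randomized; fixing the seed makes the clouds reproducible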
wordcloud(d$word,d$freq)

pal <- brewer.pal(9,"BuGn")
pal <- pal[-(1:4)]
wordcloud(d$word,d$freq,c(8,.3),2,,FALSE,,.15,pal)

pal <- brewer.pal(6,"Dark2")
pal <- pal[-(1)]
wordcloud(d$word,d$freq,c(8,.3),2,,TRUE,,.15,pal)



# random colors
wordcloud(d$word, d$freq, scale = c(8, .3), min.freq = 2, random.order = TRUE,
          random.color = TRUE, rot.per = .15, colors = pal)

##### with font #####
wordcloud(d$word, d$freq, scale = c(8, .3), min.freq = 2, random.order = TRUE,
          rot.per = .15, colors = pal, vfont = c("gothic english", "plain"))
wordcloud(d$word, d$freq, scale = c(8, .3), min.freq = 2, max.words = 100,
          random.order = TRUE, rot.per = .15, colors = pal, vfont = c("script", "plain"))
wordcloud(d$word, d$freq, scale = c(8, .3), min.freq = 2, max.words = 100,
          random.order = TRUE, rot.per = .15, colors = pal, vfont = c("serif", "plain"))

Conducted by: Subasish Das