require(sentimentr) # sentiment() with n.before, n.after, and amplifier.weight matches the sentimentr interface
sample <- c("You're not crazy and I love you very much.")
# score with a narrow context window around each polarized word
y <- sentiment(sample, n.before = 4, n.after = 2, amplifier.weight = 1)
mean(y$sentiment)
# score again with an unbounded context window
y <- sentiment(sample, n.before = Inf, n.after = Inf, amplifier.weight = 1)
mean(y$sentiment)
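Which words the scorer treats as polarized, and hence where the n.before/n.after window sits, can also be inspected; a brief sketch, assuming the sentimentr package:
# sketch (assumes sentimentr): list the positive and negative terms
# detected in the sample sentence
extract_sentiment_terms(sample)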
Create a corpus of Warren Buffett’s letters for 2008-2012.
require(stringr)
require(tm)
# set up a data frame to hold up to 5 letters
df <- data.frame(num = 5)
begin <- 2008 # letters in the range 2008-2012
i <- begin
# read the letters
while (i < 2013) {
  y <- as.character(i)
  # create the file name
  f <- str_c('http://www.richardtwatson.com/BuffettLetters/', y, 'ltr.txt', sep = '')
  # read the letter as one large string
  d <- readChar(f, nchars = 1e6)
  # add the letter to the data frame
  df[i - begin + 1, ] <- d
  i <- i + 1
}
# create the corpus
letters <- Corpus(DataframeSource(as.data.frame(df), encoding = "UTF-8"))
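A quick sanity check, offered as a supplementary sketch rather than part of the original answer, confirms that all five letters loaded:
# sketch: verify the corpus holds the five letters for 2008-2012
length(letters)
summary(letters)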
What is the Flesch-Kincaid score for the 2010 letter?
require(koRpus)
# tokenize the 2010 letter (the third letter in the corpus)
tagged.text <- tokenize(letters[[3]], format = "obj", lang = "en")
# score its readability
readability(tagged.text, "Flesch.Kincaid", hyphen = NULL, force.lang = "en")
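To compare readability across years, the same call can be looped over every letter; a sketch under the same assumptions as the answer above:
# sketch: compute the Flesch-Kincaid score for each letter in the corpus
for (i in 1:length(letters)) {
  tagged <- tokenize(letters[[i]], format = "obj", lang = "en")
  print(readability(tagged, "Flesch.Kincaid", hyphen = NULL, force.lang = "en"))
}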
Create a term-document matrix and find the words occurring more than 150 times in the letters for 2008-2012. Do appropriate preprocessing.
# convert to lower case
clean.letters <- tm_map(letters, tolower)
# remove punctuation
clean.letters <- tm_map(clean.letters, removePunctuation)
# remove numbers
clean.letters <- tm_map(clean.letters, removeNumbers)
# remove stop words
clean.letters <- tm_map(clean.letters, removeWords, stopwords('SMART'))
# strip extra white space
clean.letters <- tm_map(clean.letters, stripWhitespace)
# stem the document -- takes a while to run
stem.letters <- tm_map(clean.letters, stemDocument, language = "english")
# stem completion -- takes a while to run
stem.letters <- tm_map(stem.letters, stemCompletion, dictionary = clean.letters)
# create the term-document matrix -- one row for each term and one column for each document
tdm <- TermDocumentMatrix(clean.letters, control = list(minWordLength = 3))
dim(tdm)
# find the terms occurring 150 or more times
findFreqTerms(tdm, lowfreq = 150, highfreq = Inf)
Report the frequency of the 20 most frequent words. Do several runs to identify words that should be removed from the top 20 and remove them.
# Create a term document matrix
tdm <- TermDocumentMatrix(stem.letters)
# convert term document matrix to a regular matrix to get frequencies of words
m <- as.matrix(tdm)
# sort on frequency of terms to get frequencies of words
v <- sort(rowSums(m), decreasing=TRUE)
# display the 20 most frequent words
v[1:20]
# continue the process by removing words
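A sketch of the removal step follows; the words in drop.words are placeholders for whatever uninformative terms the runs above reveal:
# sketch: drop words judged uninformative after inspecting v[1:20]
# (replace the placeholder words with the ones you actually identify)
drop.words <- c("will", "year", "business")
stem.letters <- tm_map(stem.letters, removeWords, drop.words)
tdm <- TermDocumentMatrix(stem.letters)
v <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
v[1:20]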
Produce a word cloud for the words identified in the prior exercise.
library(wordcloud)
# build a data frame of words and frequencies from the sorted vector v
d <- data.frame(word = names(v), freq = v)
# select the color palette
pal <- brewer.pal(5, "Accent")
# generate the cloud based on the 30 most frequent words
wordcloud(d$word, d$freq, min.freq = d$freq[30], colors = pal)
Select a word and compute its association with other words in the Buffett letters corpus. Adjust the correlation coefficient to get about 10 words.
# compute the associations
findAssocs(tdm, "insurance",0.90)
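To land on roughly 10 associated words, rerun findAssocs() at different cut-offs and keep the one that returns about that many; a sketch:
# sketch: count how many associated words each correlation cut-off returns
for (r in c(0.80, 0.85, 0.90, 0.95)) {
  cat(r, ":", length(findAssocs(tdm, "insurance", r)[["insurance"]]), "words\n")
}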
Review the documentation of the hclust function in the stats package and try one or two other clustering techniques.
require(ggplot2)
require(ggdendro)
# set up the term-document matrix
tdm <- TermDocumentMatrix(clean.letters)
# name the columns with each letter's year
colnames(tdm) <- 2008:2012
# Remove sparse terms
tdm1 <- removeSparseTerms(tdm, 0.5)
# transpose the matrix
tdmtranspose <- t(tdm1)
# hierarchical clustering with the centroid agglomeration method
cluster <- hclust(dist(tdmtranspose), method = 'centroid')
# get the clustering data
dend <- as.dendrogram(cluster)
# plot the tree
ggdendrogram(dend, rotate = TRUE)
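Other agglomeration methods listed in the hclust documentation (for example 'ward.D2' and 'average') can be tried on the same distance matrix; a sketch:
# sketch: compare two other agglomeration methods on the same distances
cluster.ward <- hclust(dist(tdmtranspose), method = 'ward.D2')
ggdendrogram(as.dendrogram(cluster.ward), rotate = TRUE)
cluster.avg <- hclust(dist(tdmtranspose), method = 'average')
ggdendrogram(as.dendrogram(cluster.avg), rotate = TRUE)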