### If you've never used R before, this is an R script file.
### Lines that begin with at least one hashtag are comment lines.
### Anything to the right of a hashtag is ignored by R.
### Lines without hashtags are command lines. These lines must be
### 'run' to give instructions to R.
## If you have never or seldom used R, consider enrolling in a
## Data Inquiry Lab Intro to R workshop!
### Fortunately, with the stylo package, we can mostly avoid writing command lines ourselves.

#################################### STYLOMETRY ############################

## We first need to install the stylo package.
## Run command lines by highlighting the lines and using the shortcut keys
## command + return (Mac) or ctrl + R (Windows).
## Or the command lines can be typed directly into the R Console window.

## Run the two lines below:
install.packages("stylo")
## If prompted to install dependencies, do so -- "Yes"

## To call the package into memory we use the library() function:
library(stylo)

## To start the stylo interactive menus, type the following line at the prompt:
stylo()

#################################### SENTIMENT ANALYSIS ############################

## We will use two R packages to work on sentiment analysis.
## The first is 'syuzhet', which illustrates the theory and application of
## sentiment analysis; the second is 'tidytext', which belongs to a broader
## R ecosystem for text mining.
## There is no graphical user interface for these packages. We have to use
## command lines.

## Intro to syuzhet:
## https://cran.r-project.org/web/packages/syuzhet/vignettes/syuzhet-vignette.html
## And documentation:
## https://cran.r-project.org/web/packages/syuzhet/syuzhet.pdf

## Install and load the package:
install.packages("syuzhet")
library(syuzhet)

### The package uses the NRC sentiment lexicon: 8 emotions and overall valence.
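## The eight emotion columns are anger, anticipation, disgust, fear, joy,
## sadness, surprise, and trust; valence is reported as separate 'negative'
## and 'positive' counts. Background on the lexicon: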
## http://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm

### To calculate NRC sentiment on a text string, use get_nrc_sentiment():
get_nrc_sentiment("It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.")

##### Longer texts, or texts with double quotes, require pre-processing.
## Let's explore a few functions in the package.

### Set your working directory to the "corpus" subdirectory of the British fiction texts.
## We will read in the text of "Austen_Sense.txt" and store it in an object "text1":
text1 <- get_text_as_string("Austen_Sense.txt")
## text1 contains the entire book

## The get_sentences() function identifies sentences:
text1_sentences <- get_sentences(text1)

## Then, with the sentences, calculate NRC sentiment valence
## (method = "nrc" selects the NRC lexicon; the default is syuzhet's own lexicon):
text1_sentiment <- get_sentiment(text1_sentences, method = "nrc")

## But notice we need a smoother over the noisy scores across each sentence:
plot(text1_sentiment, type = "l",
     main = "Sentiment score by sentence, Sense and Sensibility",
     xlab = "Narrative time, by book sentence count",
     ylab = "Emotional valence, NRC lexicon")

## syuzhet calculates a few smoothers for us.
## One function, get_percentage_values(), calculates a binned mean score.
## The number of bins -- the chunks of text within which a mean is calculated --
## is set by bins=:
text1_percent_sentiment <- get_percentage_values(text1_sentiment, bins = 10)
plot(text1_percent_sentiment, type = "l",
     main = "Mean 'chunked' sentiment score, Sense and Sensibility",
     xlab = "Narrative time, by book sentence count",
     ylab = "Emotional valence, NRC lexicon")
## Of course, some caution is in order: the scaling of the y-axis, the meaning
## of shifts, and artefacts of bin width all deserve attention.

### One 'automatic' plot type in the package, simple_plot(), combines smoothed
### and raw sentiment scores:
simple_plot(text1_sentiment,
            title = "Smoothed sentence-based sentiment in Sense and Sensibility")

## Another approach, this time with Romeo and Juliet.
## Set the working directory to the corpus subdirectory of Shakespeare:
## File -- Change dir, or on a Mac: Misc -- Change Working Directory
romeo <- readLines("TRA_romeo_1595.txt")

## get_nrc_sentiment() will read line by line and calculate a summary score
## on each emotion, by line:
sentromeo <- get_nrc_sentiment(romeo)
head(sentromeo)

## Calculate column totals:
sentimentTotals <- data.frame(colSums(sentromeo))
sentimentTotals # a summary count over the columns
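## A quick base-R look at those totals (a minimal sketch; las = 2 just
## rotates the category labels on the axis):
barplot(colSums(sentromeo), las = 2,
        main = "NRC emotion counts, Romeo and Juliet")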
### For this visualization, we'll use a more sophisticated graphics package.
## We need to install it first:
install.packages("ggplot2")
library(ggplot2)

## Sentiment counts and a bar graph:
names(sentimentTotals) <- "count" # label the counts
sentimentTotals <- cbind("sentiment" = rownames(sentimentTotals), sentimentTotals) # bind the two columns together
rownames(sentimentTotals) <- NULL # remove default rownames prior to plotting, for customization below
ggplot(data = sentimentTotals, aes(x = sentiment, y = count)) +
  geom_bar(aes(fill = sentiment), stat = "identity") +
  theme(legend.position = "none") +
  xlab("Sentiment") + ylab("Total Count") +
  ggtitle("Total Sentiment Score for Romeo and Juliet")

#### Extra material if interested:
### Sentiment analysis of a screenplay -- DS9, S1E1
### One additional example, if time allows: use of the tidytext package.
### tidytext adopts the 'tidyverse' ecosystem of data management and analysis.

# install.packages(c('dplyr', 'stringr', 'magrittr', 'tidytext', 'tidyr'))
library(dplyr)
library(stringr)
## for plotting figures
library(ggplot2)
## occasional use of the pipe operator
library(magrittr)
## for spread(), used below
library(tidyr)
## use of the tidytext package for text mining
library(tidytext)

## Reading in the file for a sentiment analysis of each ACT:
DS9e411 <- readLines(con = "DS9e411.txt")
length(DS9e411)

# preparing to convert to a tibble for tidy text mining -------------------
## The text data needs to be a tibble to work with tidytext.
## All sections below start with this tibble, which pairs each line of text
## with its line number:
DS9e411DF <- tibble(line = 1:3063, text = DS9e411)
DS9e411DF
## The dataset is now a tibble, ready for text processing with tidytext.

## The sentiment-by-act code below needs an 'act' column. This derivation
## assumes the screenplay marks act breaks with lines beginning "ACT" (so the
## header and Teaser are part 0, and Act One..Act Five are 1..5); adjust the
## pattern if your copy of the file is formatted differently:
DS9e411DF <- DS9e411DF %>% mutate(act = cumsum(str_detect(text, "^ACT ")))

# removal of stop words after creation of tidy text version and mfw bar graphs ------------
DS9e411tidy <- DS9e411DF %>% unnest_tokens(word, text)
DS9e411tidy %>% anti_join(stop_words)

DS9e411tidy %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  filter(n > 30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) + geom_col() + xlab(NULL) + coord_flip()

## The same plot, annotated step by step and with custom colours
## (the 11 colours assume exactly 11 words pass the n > 30 filter):
DS9e411tidy %>%
  anti_join(stop_words) %>%            ## remove the stop words
  count(word, sort = TRUE) %>%         ## count and sort by word frequency
  filter(n > 30) %>%                   ## keep only words with n > 30
  mutate(word = reorder(word, n)) %>%  ## reorder the word factor by frequency
  ggplot() + coord_flip() +
  geom_bar(aes(x = word, y = n), stat = "identity",
           fill = c("#E1BD6D", "#E4BD7C", "#E8BD8C", "#BDAF89", "#649373",
                    "#0B775E", "#1B5656", "#2C364E", "#5A283E", "#A62C26",
                    "#F2300F"),
           colour = c("#E1BD6D", "#E4BD7C", "#E8BD8C", "#BDAF89", "#649373",
                      "#0B775E", "#1B5656", "#2C364E", "#5A283E", "#A62C26",
                      "#F2300F")) +
  labs(title = "Most frequent words in Star Trek DS9 E01S01 Screenplay",
       subtitle = "function words excluded",
       caption = "Quark appears almost twice as frequently as other characters.") +
  theme(legend.position = "none")

## Sentiment per act with the 'bing' lexicon: count positive and negative
## words within each act, spread them into columns, and take the difference:
DS9e411tidysent <- DS9e411tidy %>%
  inner_join(get_sentiments("bing")) %>%
  count(index = act, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)

# ggplot of total sentiment over the course of the screenplay #############
ggplot(DS9e411tidysent, aes(x = index, y = sentiment)) +
  geom_bar(aes(fill = sentiment), stat = "identity", show.legend = FALSE) +
  # geom_col(show.legend = FALSE) +
  xlab("Script parts, from header and Teaser (0) to Act Five (5)") +
  ylab("Positive or negative sentiment totals") +
  ggtitle("Total Sentiment, positive or negative, for DS9 E01_S01 Screenplay")
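## For reference, the 'bing' lexicon used above is simply a two-column table
## of English words labelled positive or negative; you can inspect it directly:
get_sentiments("bing")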
#########################################################################################################
###################### text importing with Project Gutenberg ###########################################
#########################################################################################################

## The gutenbergr package will download plain-text files from Project Gutenberg:
## https://cran.r-project.org/web/packages/gutenbergr/vignettes/intro.html
install.packages("gutenbergr")
library(gutenbergr)

## Pride and Prejudice is Project Gutenberg number 42671
## https://www.gutenberg.org/ebooks/42671
prideandprejudice2 <- gutenberg_download(42671)
prideandprejudice2

# *** SCRATCH: the distance calculations behind stylometry ***
# The input matrix uses Euclidean distances, calculated with dist().
# In the dist() function, Euclidean distance is the default; note the other
# options: "euclidean", "maximum", "manhattan", "canberra", "binary" or "minkowski".
# The t() function transposes a matrix if rows and columns need swapping.

## The scatterplots below use scatterplot() from the 'car' package and
## scatterplot3d() from the 'scatterplot3d' package:
library(car)
library(scatterplot3d)

# Building a small matrix with matrix():
A <- matrix(c(2, 4, 3, 1, 5, 7), # the data elements
            nrow = 2,            # number of rows
            ncol = 3,            # number of columns
            byrow = TRUE)
A

# 2 points, with (X, Y) coordinates (3,6) and (1,5):
p <- c(3, 6)
q <- c(1, 5)
mat2d <- rbind(p, q)
mat2d

dmat2d <- as.data.frame(mat2d)
pdf(file = "Euclid2d.pdf", height = 10, width = 10)
scatterplot(dmat2d$V1, dmat2d$V2, regLine = FALSE, smooth = FALSE,
            boxplots = FALSE, xlab = "X axis", ylab = "Y axis")
dev.off()

# Calculating the distance between points p and q with dist(),
# which uses Euclidean distance by default:
distmat <- dist(mat2d)
distmat
# Using the formula for Euclidean distance:
sqrt((3 - 1)^2 + (6 - 5)^2)
# provides the same answer.

# For a 3-D matrix:
p <- c(1, 3, 5)     # coordinates for point p in 3D
q <- c(2, 6, 3)     # coordinates for point q in 3D
matx <- rbind(p, q) # create a matrix by binding the rows of p and q together
matx
Dmatx <- dist(matx) # an object containing the Euclidean distance
Dmatx
# check the calculation:
sqrt((1 - 2)^2 + (3 - 6)^2 + (5 - 3)^2)

# Adding a 3rd point:
r <- c(3, 6, 7)
matx <- rbind(p, q, r)
matx
row.names(matx)
Dmatx <- dist(matx) # now a 3-point (lower-triangle) distance matrix
Dmatx

matxD <- as.data.frame(matx)
names(matxD)
pdf(file = "3dscatter2.pdf", height = 10, width = 10)
scatterplot3d(matxD$V1, matxD$V2, matxD$V3, xlim = c(0, 10), ylim = c(0, 10),
              xlab = "X axis", ylab = "Y axis", zlab = "Z axis",
              highlight.3d = TRUE)
dev.off()

## Interactive 3-D versions, with the rgl package and car's rgl-based scatter3d():
library(rgl)
plot3d(matxD$V1, matxD$V2, matxD$V3, xlim = c(0, 10), ylim = c(0, 10))
scatter3d(matxD$V1, matxD$V2, matxD$V3,
          xlab = "X axis", ylab = "Y axis", zlab = "Z axis")
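## Two quick extensions of the scratch work above (minimal sketches, base R only):
## dist() with one of the alternative metrics mentioned earlier, and a toy
## dendrogram -- the same kind of object that stylo's cluster analysis builds
## from its word-frequency distance tables.
dist(matx, method = "manhattan") # Manhattan distance for the 3-point matrix
hc <- hclust(Dmatx)              # hierarchical clustering on the Euclidean distances
plot(hc, main = "Toy dendrogram from the 3-point distance matrix")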