### If you've never used R before, this is an R script file.
### Lines that begin with at least one hashtag are comment lines.
### Anything to the right of a hashtag is ignored by R.
### Lines without hashtags are command lines. These lines must be
### 'run' to give instructions to R.
## If you have never or seldom used R, consider enrolling in a Data Inquiry Lab Intro to R workshop!
### Fortunately, with the stylo package, we can mostly ignore command lines.

#################################### STYLOMETRY ############################

## We first need to install the stylo package.
## Run command lines by highlighting the lines, then use the shortcut keys
## command + return (Mac) or ctrl + R (Windows).
## Or the command lines can be typed directly into the R Console window.
## Run the two lines below (install.packages() and library()):
install.packages("stylo")
## If prompted to install dependencies, do so -- "Yes"

## To call the package into memory we use the library() function:
library(stylo)

## To start the stylo interactive menus, type the following line at the prompt:
stylo()

#################################### SENTIMENT ANALYSIS ############################

## We will use two R packages to work on sentiment analysis.
## The first is 'syuzhet', which illustrates the theory and application of sentiment analysis,
## while the second is 'tidytext', which is a broader R ecosystem for text mining.
## There is no graphical user interface for these packages. We have to use command lines:
## Intro to syuzhet: https://cran.r-project.org/web/packages/syuzhet/vignettes/syuzhet-vignette.html
## And documentation: https://cran.r-project.org/web/packages/syuzhet/syuzhet.pdf

## Install the package:
install.packages("syuzhet")
library(syuzhet)
### package uses NRC sentiment lexicon: 8 emotions and overall valence.
## http://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm

### to calculate nrc sentiment on a text string, use this function: get_nrc_sentiment()
get_nrc_sentiment("It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.")

##### Longer texts, or texts with double quotes, require pre-processing
## Let's explore a few functions in the package:
### Set your working directory to the "corpus" subdirectory of the British fiction texts

## We will read in the text of "Austen_Sense.txt", and store it in an object "text1"
text1 <- get_text_as_string("Austen_Sense.txt")
## text1 contains the entire book

## get_sentences() function identifies sentences
text1_sentences <- get_sentences(text1)

## Then with the sentences, calculate NRC sentiment valence.
## method = "nrc" is spelled out because get_sentiment() otherwise falls back
## to its default lexicon, which would not match the "NRC lexicon" axis labels.
text1_sentiment <- get_sentiment(text1_sentences, method = "nrc")

## But notice we need a smoother over the noisy scores across each line:
plot(text1_sentiment,
     type = "l",
     main = "sentiment score by sentence, Sense and Sensibility",
     xlab = "narrative time, by book sentence count",
     ylab = "Emotional valence, NRC lexicon")

## Syuzhet calculates a few smoothers for us:
## One function will calculate a binned mean score, get_percentage_values()
## The number of bins --- chunks of text within which a mean is calculated --- is set by bins=
text1_percent_sentiment <- get_percentage_values(text1_sentiment, bins = 10)
plot(text1_percent_sentiment,
     type = "l",
     main = "mean 'chunked' sentiment score, Sense and Sensibility",
     xlab = "narrative time, by book sentence count",
     ylab = "Emotional valence, NRC lexicon")
## Of course, some caution is in order: scaling of the y-axis, the meaning of shifts, and artefact of bin width

### One 'automatic' plot type in the package combines a smoothed and raw sentiment score: simple_plot()
simple_plot(text1_sentiment, title = "smoothed sentence based sentiment in Sense and Sensibility")

###### TK for examples, and object creation, do generic name so that only name of input text has to be changed.

## Another approach, this time with Romeo and Juliet:
## Set the working directory to the corpus subdirectory of Shakespeare:
## File --- Change dir, or on a Mac -- Misc -- Change Working Directory
romeo <- readLines("TRA_romeo_1595.txt")

## will read line by line and calculate a summary score on each emotion, by line:
sentromeo <- get_nrc_sentiment(romeo)
head(sentromeo)

## calculate column totals:
sentimentTotals <- data.frame(colSums(sentromeo))
sentimentTotals # A summary count over the columns

### For this visualization, we'll use a more sophisticated graphics package.
## We need to install it first:
install.packages("ggplot2")
library(ggplot2)

## sentiment counts and a bar graph:
names(sentimentTotals) <- "count" # label the counts
sentimentTotals <- cbind("sentiment" = rownames(sentimentTotals), sentimentTotals) # bind the two columns together
rownames(sentimentTotals) <- NULL # remove default rownames prior to plotting, for customization below:
ggplot(data = sentimentTotals, aes(x = sentiment, y = count)) +
  geom_bar(aes(fill = sentiment), stat = "identity") +
  theme(legend.position = "none") +
  xlab("Sentiment") +
  ylab("Total Count") +
  ggtitle("Total Sentiment Score for Romeo and Juliet")

#### Extra material if interested:
### Sentiment analysis of a screenplay -- DS9, S1E1
### One additional example, if time allows: Use of the tidytext package
### tidytext adopts the 'tidyverse' ecosystem of data management and analysis

# install.packages(c('dplyr', 'stringr', 'magrittr', 'tidytext'))
library(dplyr)
library(stringr)
## for plotting figures
library(ggplot2)
## occasional use of pipe operator
library(magrittr)
## Use of tidytext package for text mining.
library(tidytext)

## reading in the file for a sentiment analysis of each ACT
DS9e411 <- readLines(con = "DS9e411.txt")
length(DS9e411)

# preparing to convert to a tibble for tidy text mining -------------------
## The text data needs to be a tibble to work with tidytext.
## All sections below start with a tibble:
## one row per line of the screenplay, with a running line number beside the text.
## seq_along() sizes the line column from the data itself, so the tibble is
## built correctly whatever the length of the file that was read in.
DS9e411DF <- tibble(line = seq_along(DS9e411), text = DS9e411)
DS9e411DF
## dataset is now a tibble; ready for text processing with tidytext.

# tidy text version first, then stop-word removal and most-frequent-word (mfw) bar graphs ------------

## One token (word) per row, keeping the line number each word came from
DS9e411tidy <- unnest_tokens(DS9e411DF, word, text)

## Preview what is left once the stop words are dropped
anti_join(DS9e411tidy, stop_words)

## Horizontal bar chart of the remaining words that occur more than 30 times
DS9e411tidy %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  filter(n > 30) %>%
  mutate(word = reorder(word, n)) %>%  ## order the bars by frequency
  ggplot(mapping = aes(x = word, y = n)) +
  geom_bar(stat = "identity") +
  labs(x = NULL) +
  coord_flip()

## Most-frequent-word chart again, this time with a custom colour ramp.
## The 11 anchor colours below are stretched (or shrunk) to one colour per bar.
## Passing the 11 colours to geom_bar() directly only works when exactly 11
## words pass the n > 30 filter; any other count makes ggplot2 reject the
## fill/colour vectors. With 11 bars the ramp is expected to return the anchor
## colours unchanged.
DS9e411palette <- c("#E1BD6D", "#E4BD7C", "#E8BD8C", "#BDAF89", "#649373", "#0B775E",
                    "#1B5656", "#2C364E", "#5A283E", "#A62C26", "#F2300F")

DS9e411topwords <- DS9e411tidy %>%
  anti_join(stop_words) %>%  ## this removes the stop words
  count(word, sort = TRUE) %>%  ## sort by word frequency
  filter(n > 30) %>%  ## filter on n > 30
  mutate(word = reorder(word, n))  ## re-creates word as a factor ordered by n

## one colour per row of the plotted data
DS9e411barcolours <- colorRampPalette(DS9e411palette)(nrow(DS9e411topwords))

## fill/colour are set as constants, so no colour mapping is needed inside aes()
ggplot(DS9e411topwords) +
  coord_flip() +
  geom_bar(aes(x = word, y = n), stat = "identity",
           fill = DS9e411barcolours,
           colour = DS9e411barcolours) +
  labs(title = "Most frequent words in Star Trek DS9 E01S01 Screenplay",
       subtitle = "function words excluded",
       caption = "Quark appears almost twice as frequently as other characters.") +
  theme(legend.position = "none")

## Net sentiment per script part: tag each word with the Bing lexicon
## (positive / negative), count the tags within each part, put the two counts
## side by side, and subtract negative from positive.
##
## NOTE(review): this step needs an `act` column in DS9e411tidy, but the tibble
## built earlier in this script only has `line` and `text` (hence `line` and
## `word` after tokenising). The step that assigns each line to a script part
## appears to be missing -- add it before running this block. One possible
## approach, assuming act breaks are lines such as "TEASER" or "ACT ONE"
## (TODO confirm against DS9e411.txt), is a running count of heading lines:
##   mutate(act = cumsum(str_detect(text, "^\\s*(TEASER|ACT [A-Z]+)\\s*$")))
## applied to DS9e411DF before unnest_tokens().
DS9e411tidysent <- DS9e411tidy %>%
  inner_join(get_sentiments("bing"), by = "word") %>%  ## explicit join key
  count(index = act, sentiment) %>%
  ## spread() lives in tidyr, which this script never attaches with library(),
  ## so it is called through its namespace
  tidyr::spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)

# Bar chart of net sentiment for each part of the screenplay
###########################################################

## One bar per script part; bar height and fill shade both follow the net
## (positive minus negative) score, and the fill legend is hidden.
ggplot(data = DS9e411tidysent,
       mapping = aes(x = index, y = sentiment, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  labs(x = "Script parts, from header to Teaser (0) to Act Five (5)",
       y = "Positive or negative sentiment totals",
       title = "Total Sentiment, positive or negative for DS9 E01_S01 Screenplay")

#########################################################################################################
###################### text importing with Project Gutenberg ############################################
#########################################################################################################

## The gutenbergr package fetches plain text files from Project Gutenberg.
## Vignette: https://cran.r-project.org/web/packages/gutenbergr/vignettes/intro.html

install.packages(pkgs = "gutenbergr")
library("gutenbergr")

## Project Gutenberg ebook number 42671 is Pride and Prejudice:
## https://www.gutenberg.org/ebooks/42671

## Download the book, then print the result to inspect it
prideandprejudice2 <- gutenberg_download(gutenberg_id = 42671)
prideandprejudice2