### If you've never used R before, this is an R script file.
### Lines that begin with at least one hashtag are comment lines.
### Anything to the right of a hashtag is ignored by R.
### Lines without hashtags are command lines. These lines must be
### 'run' to give instructions to R.
## If you have never or seldom used R, consider enrolling in a
## Data Inquiry Lab Intro to R workshop!
### Fortunately, with the stylo package, we can mostly avoid writing command lines ourselves.

#################################### STYLOMETRY ############################

## We first need to install the stylo package.
## Run command lines by highlighting the lines and using the shortcut keys
## command + return (Mac) or ctrl + R (Windows).
## Or the command lines can be typed directly into the R Console window.

## Run the two lines below:
install.packages("stylo")
## If prompted to install dependencies, do so -- "Yes"

## To call the package into memory we use the library() function:
library(stylo)

## To start the stylo interactive menus, type the following line at the prompt:
stylo()

#################################### SENTIMENT ANALYSIS ############################

## We will use two R packages to work on sentiment analysis.
## The first is 'syuzhet', which illustrates the theory and application of
## sentiment analysis; the second is 'tidytext', which belongs to a broader
## R ecosystem for text mining.
## There is no graphical user interface for these packages. We have to use
## command lines.

## Intro to syuzhet:
## https://cran.r-project.org/web/packages/syuzhet/vignettes/syuzhet-vignette.html
## And documentation:
## https://cran.r-project.org/web/packages/syuzhet/syuzhet.pdf

## Install and load the package:
install.packages("syuzhet")
library(syuzhet)

### The package uses the NRC sentiment lexicon: 8 emotions and overall valence.
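## The eight emotion columns are anger, anticipation, disgust, fear, joy,
## sadness, surprise, and trust; valence is reported as separate 'negative'
## and 'positive' counts. Background on the lexicon: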
## http://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm

### To calculate NRC sentiment on a text string, use get_nrc_sentiment():
get_nrc_sentiment("It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.")

##### Longer texts, or texts with double quotes, require pre-processing.
## Let's explore a few functions in the package.

### Set your working directory to the "corpus" subdirectory of the British fiction texts.
## We will read in the text of "Austen_Sense.txt" and store it in an object "text1":
text1 <- get_text_as_string("Austen_Sense.txt")
## text1 contains the entire book

## The get_sentences() function identifies sentences:
text1_sentences <- get_sentences(text1)

## Then, with the sentences, calculate NRC sentiment valence
## (method = "nrc" selects the NRC lexicon; the default is syuzhet's own lexicon):
text1_sentiment <- get_sentiment(text1_sentences, method = "nrc")

## But notice we need a smoother over the noisy scores across each sentence:
plot(text1_sentiment, type = "l",
     main = "Sentiment score by sentence, Sense and Sensibility",
     xlab = "Narrative time, by book sentence count",
     ylab = "Emotional valence, NRC lexicon")

## syuzhet calculates a few smoothers for us.
## One function, get_percentage_values(), calculates a binned mean score.
## The number of bins -- the chunks of text within which a mean is calculated --
## is set by bins=:
text1_percent_sentiment <- get_percentage_values(text1_sentiment, bins = 10)
plot(text1_percent_sentiment, type = "l",
     main = "Mean 'chunked' sentiment score, Sense and Sensibility",
     xlab = "Narrative time, by book sentence count",
     ylab = "Emotional valence, NRC lexicon")
## Of course, some caution is in order: the scaling of the y-axis, the meaning
## of shifts, and artefacts of bin width all deserve attention.

### One 'automatic' plot type in the package, simple_plot(), combines smoothed
### and raw sentiment scores:
simple_plot(text1_sentiment,
            title = "Smoothed sentence-based sentiment in Sense and Sensibility")

## Another approach, this time with Romeo and Juliet.
## Set the working directory to the corpus subdirectory of Shakespeare:
## File -- Change dir, or on a Mac: Misc -- Change Working Directory
romeo <- readLines("TRA_romeo_1595.txt")

## get_nrc_sentiment() will read line by line and calculate a summary score
## on each emotion, by line:
sentromeo <- get_nrc_sentiment(romeo)
head(sentromeo)

## Calculate column totals:
sentimentTotals <- data.frame(colSums(sentromeo))
sentimentTotals # a summary count over the columns
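## A quick base-R look at those totals (a minimal sketch; las = 2 just
## rotates the category labels on the axis):
barplot(colSums(sentromeo), las = 2,
        main = "NRC emotion counts, Romeo and Juliet")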
### For this visualization, we'll use a more sophisticated graphics package.
## We need to install it first:
install.packages("ggplot2")
library(ggplot2)

## Sentiment counts and a bar graph:
names(sentimentTotals) <- "count" # label the counts
sentimentTotals <- cbind("sentiment" = rownames(sentimentTotals), sentimentTotals) # bind the two columns together
rownames(sentimentTotals) <- NULL # remove default rownames prior to plotting, for customization below
ggplot(data = sentimentTotals, aes(x = sentiment, y = count)) +
  geom_bar(aes(fill = sentiment), stat = "identity") +
  theme(legend.position = "none") +
  xlab("Sentiment") + ylab("Total Count") +
  ggtitle("Total Sentiment Score for Romeo and Juliet")

#### Extra material if interested:
### Sentiment analysis of a screenplay -- DS9, S1E1
### One additional example, if time allows: use of the tidytext package.
### tidytext adopts the 'tidyverse' ecosystem of data management and analysis.

# install.packages(c('dplyr', 'stringr', 'magrittr', 'tidytext', 'tidyr'))
library(dplyr)
library(stringr)
## for plotting figures
library(ggplot2)
## occasional use of the pipe operator
library(magrittr)
## for spread(), used below
library(tidyr)
## use of the tidytext package for text mining
library(tidytext)

## Reading in the file for a sentiment analysis of each ACT:
DS9e411 <- readLines(con = "DS9e411.txt")
length(DS9e411)

# preparing to convert to a tibble for tidy text mining -------------------
## The text data needs to be a tibble to work with tidytext.
## All sections below start with this tibble, which pairs each line of text
## with its line number:
DS9e411DF <- tibble(line = 1:3063, text = DS9e411)
DS9e411DF
## The dataset is now a tibble, ready for text processing with tidytext.

## The sentiment-by-act code below needs an 'act' column. This derivation
## assumes the screenplay marks act breaks with lines beginning "ACT" (so the
## header and Teaser are part 0, and Act One..Act Five are 1..5); adjust the
## pattern if your copy of the file is formatted differently:
DS9e411DF <- DS9e411DF %>% mutate(act = cumsum(str_detect(text, "^ACT ")))

# removal of stop words after creation of tidy text version and mfw bar graphs ------------
DS9e411tidy <- DS9e411DF %>% unnest_tokens(word, text)
DS9e411tidy %>% anti_join(stop_words)

DS9e411tidy %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  filter(n > 30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) + geom_col() + xlab(NULL) + coord_flip()

## The same plot, annotated step by step and with custom colours
## (the 11 colours assume exactly 11 words pass the n > 30 filter):
DS9e411tidy %>%
  anti_join(stop_words) %>%            ## remove the stop words
  count(word, sort = TRUE) %>%         ## count and sort by word frequency
  filter(n > 30) %>%                   ## keep only words with n > 30
  mutate(word = reorder(word, n)) %>%  ## reorder the word factor by frequency
  ggplot() + coord_flip() +
  geom_bar(aes(x = word, y = n), stat = "identity",
           fill = c("#E1BD6D", "#E4BD7C", "#E8BD8C", "#BDAF89", "#649373",
                    "#0B775E", "#1B5656", "#2C364E", "#5A283E", "#A62C26",
                    "#F2300F"),
           colour = c("#E1BD6D", "#E4BD7C", "#E8BD8C", "#BDAF89", "#649373",
                      "#0B775E", "#1B5656", "#2C364E", "#5A283E", "#A62C26",
                      "#F2300F")) +
  labs(title = "Most frequent words in Star Trek DS9 E01S01 Screenplay",
       subtitle = "function words excluded",
       caption = "Quark appears almost twice as frequently as other characters.") +
  theme(legend.position = "none")

## Sentiment per act with the 'bing' lexicon: count positive and negative
## words within each act, spread them into columns, and take the difference:
DS9e411tidysent <- DS9e411tidy %>%
  inner_join(get_sentiments("bing")) %>%
  count(index = act, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)

# ggplot of total sentiment over the course of the screenplay #############
ggplot(DS9e411tidysent, aes(x = index, y = sentiment)) +
  geom_bar(aes(fill = sentiment), stat = "identity", show.legend = FALSE) +
  # geom_col(show.legend = FALSE) +
  xlab("Script parts, from header and Teaser (0) to Act Five (5)") +
  ylab("Positive or negative sentiment totals") +
  ggtitle("Total Sentiment, positive or negative, for DS9 E01_S01 Screenplay")
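## For reference, the 'bing' lexicon used above is simply a two-column table
## of English words labelled positive or negative; you can inspect it directly:
get_sentiments("bing")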
#########################################################################################################
###################### text importing with Project Gutenberg ###########################################
#########################################################################################################

## The gutenbergr package will download plain-text files from Project Gutenberg:
## https://cran.r-project.org/web/packages/gutenbergr/vignettes/intro.html
install.packages("gutenbergr")
library(gutenbergr)

## Pride and Prejudice is Project Gutenberg number 42671
## https://www.gutenberg.org/ebooks/42671
prideandprejudice2 <- gutenberg_download(42671)
prideandprejudice2

# *** SCRATCH: the distance calculations behind stylometry ***
# The input matrix uses Euclidean distances, calculated with dist().
# In the dist() function, Euclidean distance is the default; note the other
# options: "euclidean", "maximum", "manhattan", "canberra", "binary" or "minkowski".
# The t() function transposes a matrix if rows and columns need swapping.

## The scatterplots below use scatterplot() from the 'car' package and
## scatterplot3d() from the 'scatterplot3d' package:
library(car)
library(scatterplot3d)

# Building a small matrix with matrix():
A <- matrix(c(2, 4, 3, 1, 5, 7), # the data elements
            nrow = 2,            # number of rows
            ncol = 3,            # number of columns
            byrow = TRUE)
A

# 2 points, with (X, Y) coordinates (3,6) and (1,5):
p <- c(3, 6)
q <- c(1, 5)
mat2d <- rbind(p, q)
mat2d

dmat2d <- as.data.frame(mat2d)
pdf(file = "Euclid2d.pdf", height = 10, width = 10)
scatterplot(dmat2d$V1, dmat2d$V2, regLine = FALSE, smooth = FALSE,
            boxplots = FALSE, xlab = "X axis", ylab = "Y axis")
dev.off()

# Calculating the distance between points p and q with dist(),
# which uses Euclidean distance by default:
distmat <- dist(mat2d)
distmat
# Using the formula for Euclidean distance:
sqrt((3 - 1)^2 + (6 - 5)^2)
# provides the same answer.

# For a 3-D matrix:
p <- c(1, 3, 5)     # coordinates for point p in 3D
q <- c(2, 6, 3)     # coordinates for point q in 3D
matx <- rbind(p, q) # create a matrix by binding the rows of p and q together
matx
Dmatx <- dist(matx) # an object containing the Euclidean distance
Dmatx
# check the calculation:
sqrt((1 - 2)^2 + (3 - 6)^2 + (5 - 3)^2)

# Adding a 3rd point:
r <- c(3, 6, 7)
matx <- rbind(p, q, r)
matx
row.names(matx)
Dmatx <- dist(matx) # now a 3-point (lower-triangle) distance matrix
Dmatx

matxD <- as.data.frame(matx)
names(matxD)
pdf(file = "3dscatter2.pdf", height = 10, width = 10)
scatterplot3d(matxD$V1, matxD$V2, matxD$V3, xlim = c(0, 10), ylim = c(0, 10),
              xlab = "X axis", ylab = "Y axis", zlab = "Z axis",
              highlight.3d = TRUE)
dev.off()

## Interactive 3-D versions, with the rgl package and car's rgl-based scatter3d():
library(rgl)
plot3d(matxD$V1, matxD$V2, matxD$V3, xlim = c(0, 10), ylim = c(0, 10))
scatter3d(matxD$V1, matxD$V2, matxD$V3,
          xlab = "X axis", ylab = "Y axis", zlab = "Z axis")
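## Two quick extensions of the scratch work above (minimal sketches, base R only):
## dist() with one of the alternative metrics mentioned earlier, and a toy
## dendrogram -- the same kind of object that stylo's cluster analysis builds
## from its word-frequency distance tables.
dist(matx, method = "manhattan") # Manhattan distance for the 3-point matrix
hc <- hclust(Dmatx)              # hierarchical clustering on the Euclidean distances
plot(hc, main = "Toy dendrogram from the 3-point distance matrix")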