## Whitt Kilburn, Data Inquiry Lab
## Social media data analysis with R
## The first package we will use is twitteR:
## https://cran.r-project.org/web/packages/twitteR/README.html

##################### twitteR package installation #######################

install.packages("twitteR")
# use install.packages("twitteR", repos="http://cran.us.r-project.org") if you encounter a repository error
library(twitteR)

# Get your access credentials and enter them here:
consumer_key <- 'your key'
consumer_secret <- 'your secret'
access_token <- 'your access token'
access_secret <- 'your access secret'

## Then run the setup_twitter_oauth() function:
setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)
## You'll see a message about storing the access credentials in a cache file:
## [1] "Using direct authentication"
## Use a local file ('.httr-oauth'), to cache OAuth access credentials between R sessions?
## Choose 'Yes' or 'No'. Both work for our purposes.
## Once you set the authorization, you are ready to use functions in the twitteR package:

###################################################################################
###### Basic twitteR functions
###################################################################################

## The most basic function: search Twitter!
## searchTwitter() is the function.
## There are many search options, with Boolean and other operators.
## For the options, see https://dev.twitter.com/rest/public/search
## Two obvious examples: '#GVSU' or '@GVSU'

## Search Twitter for 50 recent tweets with #GVSU, storing them as GV50tweets:
GV50tweets <- searchTwitter('#GVSU', n=50)
head(GV50tweets) # first few tweets

## We can convert the tweets to a rectangular dataset (a data frame, GV50tweets_df)
## and save it as an external file:
GV50tweets_df <- do.call(rbind, lapply(GV50tweets, as.data.frame))
write.csv(GV50tweets_df, "GVSU50tweets.csv")

# % # % # % # % # % # % # % # % # % # % # % # % # % # % # % # % # %
# % # %                    Assignment!                       # % # %
# 1. Use the searchTwitter() function to find 100 recent tweets with
#    your own search terms. Store the tweets as the object
#    "mytwittersearch". Save the tweets as a CSV.
# 2. Open the CSV in Excel; edit it, and create a word cloud at
#    http://www.wordclouds.com/
# % # % # % # % # % # % # % # % # % # % # % # % # % # % # % # % # %

## Working with retweets:
## Do you want to get rid of retweets? Use the strip_retweets() function:
## strip_retweets(GV50tweets, strip_manual=TRUE, strip_mt=TRUE)
## By default, strip_retweets() removes only 'button' retweets -- those made with
## the retweet button rather than by typing 'RT' directly. The strip_manual=TRUE
## option above removes those manual retweets too.
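## A quick sketch of strip_retweets() in use, assuming the GV50tweets object
## created above; it returns a new (possibly shorter) list of tweets:
GV50noRT <- strip_retweets(GV50tweets, strip_manual=TRUE, strip_mt=TRUE)
length(GV50noRT)   # compare with length(GV50tweets)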
## Twitter limits your use of the API. A request for a user's timeline, for
## example, returns at most 3200 of their most recent tweets.
## To see when your limits will reset, run these two lines:
rate.limit <- getCurRateLimitInfo(c("lists"))
rate.limit

## Load info on a specific user, by their Twitter handle:
trump <- getUser('realDonaldTrump')
## This twitteR object can be inspected for various attributes:
trump$getFavorites(n=5)
## In the printout from the line below, notice the 'methods':
str(trump)
## We can use most of those 'methods':
trump$getDescription()
## Pulling a user's most recent 20 tweets:
trumptimeline <- userTimeline('realDonaldTrump', n=20)

######################### Twitter trends ##########################

# "Twitter keeps track of topics that are popular at any given point of time, and
# allows one to extract that data. The getTrends function is used to pull current
# trend information from a given location, which is specified using a WOEID (see
# http://developer.yahoo.com/geo/geoplanet/). Luckily there are two other
# functions to help you identify WOEIDs that you might be interested in. The
# availableTrendLocations function will return a data.frame with a location
# in each row and the woeid giving that location's WOEID."
avail_trends = availableTrendLocations()
head(avail_trends)
# the whole list of locations, about 500 places:
avail_trends

## Find trends in Detroit:
Detroittrends <- getTrends(2391585)
head(Detroittrends)
Detroittrends

## Or enter a geocoded location (Grand Rapids) for the closest approximation:
close_trends = closestTrendLocations(42.970848, -85.661656)
head(close_trends)
## In our case, not useful!

##################################### Twitter timelines ##############################################

## Other functions -- timelines to pull on yourself or other users:
# see https://www.rdocumentation.org/packages/twitteR/versions/1.1.9/topics/timelines
# userTimeline(user, n=20, maxID=NULL, sinceID=NULL, includeRts=FALSE, excludeReplies=FALSE, ...)
# homeTimeline(n=25, maxID=NULL, sinceID=NULL, ...)
# mentions(n=25, maxID=NULL, sinceID=NULL, ...)
# retweetsOfMe(n=25, maxID=NULL, sinceID=NULL, ...)

#############################################################################
######################### Text manipulation and tweet visualizations #######
#############################################################################

## United Airlines #leggingsgate: the United Airlines/girls-in-leggings controversy.
## Load a set of tweets I already collected:
load(file=url('http://faculty.gvsu.edu/kilburnw/Unitedsocialmedia.RData'))
## The workspace now holds a set of tweets already stored for us to work with.
## The commands used to pull the tweets are below:
# Unitedtweets <- searchTwitter('@United', n=3000)
## These @United tweets were requested at 13:51, Monday, March 27
head(Unitedtweets)
# Unitedhashtag_tweets <- searchTwitter('#unitedairlines', n=3000)
## These #unitedairlines tweets were requested at 16:01, Monday, March 27
head(Unitedhashtag_tweets)

## The tweets are stored as the datatype "list" in R.
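## Two quick base-R checks to confirm that:
class(Unitedtweets)    # "list"
length(Unitedtweets)   # how many tweets the search returned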
## Lists have numbered elements, accessible with double brackets, [[ ]];
## the numbers within the brackets correspond to individual tweets.
## On one tweet we can identify several attributes:
?status ## brings up the help page on Twitter status objects
onetweet <- Unitedtweets[[750]] ## the object 'onetweet' is the 750th tweet
## Start typing `onetweet$', and an RStudio tooltip shows the different characteristics:
onetweet$getScreenName()
onetweet$getText()
onetweet$getCreated()

# % # % # % # % # % # % # % # % # % # % # % # % # % # % # % # % # %
# % # %               Quick activity break!                  # % # %
# 1. Use the functions above to inspect the 1750th tweet!
# % # % # % # % # % # % # % # % # % # % # % # % # % # % # % # % # %

## For the text manipulation and visualization, we'll use a few additional packages:
install.packages("dplyr") # should already be installed on campus, I think
library(dplyr)
install.packages("purrr")
library(purrr)
# install.packages("ggplot2")
library(ggplot2)

## Convert the tweets into a data frame (rectangular dataset) with the tbl_df() function:
Unitedtweets_df <- tbl_df(map_df(Unitedtweets, as.data.frame))
# the different variables (columns) for the tweets:
names(Unitedtweets_df)
## Note: You could export these tweets to a CSV file, as before.
## We used a different set of commands to create the data frame, but the result is the same.

## Let's plot the most frequent words tweeted. We need to clean up the text first.
## NOTE: The text mining package 'tm' used to work well with tweets, but recently it has stopped working :(
## So we'll try a more recent text mining package, 'tidytext'. It's referenced at the end
## of the handout, and here's an overview:
## https://cran.r-project.org/web/packages/tidytext/vignettes/tidytext.html
install.packages("tidytext")
library(tidytext)
# We need one more package, stringr:
# install.packages("stringr")
library(stringr)

## The unnest_tokens() function will create a tidytext dataset.
## Rather than organizing text as strings, tidytext stores text in a structured format,
## with one row per token (a word or other unit of text, depending on the use).
## It automatically strips punctuation and converts text to lowercase.
## The downside is that it requires regular expressions to search for words.
## tidytext has a standard set of commands to filter out the t.co URLs, etc., and
## function (stop) words. You would run the following lines as a chunk, with your
## own data frame of tweets substituted for Unitedtweets_df:
replace_reg <- "https://t.co/[A-Za-z\\d]+|http://[A-Za-z\\d]+|&amp;|&lt;|&gt;|RT|https"
unnest_reg <- "([^A-Za-z_\\d#@']|'(?![A-Za-z_\\d#@]))"
tidy_Unitedtweets <- Unitedtweets_df %>%
  filter(!str_detect(text, "^RT")) %>%
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  unnest_tokens(word, text, token = "regex", pattern = unnest_reg) %>%
  filter(!word %in% stop_words$word,
         str_detect(word, "[a-z]"))

## The dataset `tidy_Unitedtweets' is our tidytext set of cleaned tweets:
tidy_Unitedtweets %>%
  count(word, sort = TRUE)

## We can now do some frequency plots:
tidy_Unitedtweets %>%
  count(word, sort = TRUE) %>%
  filter(n > 50) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()
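## Because the unnest_reg pattern above deliberately keeps the '#' and '@'
## prefixes on tokens, we can also rank the most-used hashtags -- a quick
## sketch using the same tidy_Unitedtweets data:
tidy_Unitedtweets %>%
  filter(str_detect(word, "^#")) %>%
  count(word, sort = TRUE) %>%
  head(10)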
## In the frequency plot above, @united is an outlier, of course. We can drop it
## from the plot by adding & n < 2000 to the filter:
tidy_Unitedtweets %>%
  count(word, sort = TRUE) %>%
  filter(n > 50 & n < 2000) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()

## Finally, with Twitter we'll attempt a lexicon-based sentiment analysis.
## The tidytext package incorporates three different sentiment lexicons.
## One is 'AFINN': http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6010
## Another, available in multiple languages, is 'NRC': http://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm
get_sentiments("afinn")
get_sentiments("nrc")

## The sentiment analysis matches words (word unigrams) to any scored words in the lexicon.
## For example, if someone used the term "abhor" in a tweet, it is scored a -3.
## A total sentiment score for a tweet is the sum of the scores of all scored words in
## the tweet (a short sketch of this per-tweet summing appears after the activity break below).
## Sentiment analysis is a huge subject, of course, and our approach has limitations:
## it won't understand "not good" as we do, or sarcasm!
## One of the authors of the tidytext package did a sentiment analysis of Trump;
## angrier tweets were sent from his Android phone:
## http://varianceexplained.org/r/trump-tweets/

## There's a similar approach with the syuzhet package:
install.packages("syuzhet")
library(syuzhet)
## Example from a Trump tweet!
get_nrc_sentiment("If the people of our great country could only see how viciously and inaccurately my administration is covered by certain media!")

## Search Twitter for 50 recent tweets with #MAGA:
MAGA50tweets <- searchTwitter('#MAGA', n=50)
head(MAGA50tweets) # first few tweets
## We create a data frame of the tweets:
MAGA50tweets_df <- do.call(rbind, lapply(MAGA50tweets, as.data.frame))
names(MAGA50tweets_df)

# Before processing, we need to remove the non-graphical characters (emoji and
# the like) from the tweets. We use regular expressions in R:
# http://stat.ethz.ch/R-manual/R-patched/library/base/html/regex.html
## Below, we rewrite the text in the dataset, replacing anything that matches
## "[^[:graph:]]" (i.e., any non-graphical character) with a blank space " ":
MAGA50tweets_df$text = str_replace_all(MAGA50tweets_df$text, "[^[:graph:]]", " ")

## Sentiment scores on the tweet text:
MAGAsentiment <- get_nrc_sentiment(MAGA50tweets_df$text)
## Merge with the Twitter dataset:
MAGA50tweets_df <- cbind(MAGA50tweets_df, MAGAsentiment)
names(MAGA50tweets_df)

## We total up the sentiment in the tweets for each emotion.
## (Columns 17 to 24 are the eight emotion columns appended by get_nrc_sentiment.
## We could also use the overall positive and negative columns; here we'll work
## with the individual emotions.)
sentimentTotals <- data.frame(colSums(MAGA50tweets_df[,c(17:24)]))
names(sentimentTotals) <- "count"
sentimentTotals <- cbind("sentiment" = rownames(sentimentTotals), sentimentTotals)
rownames(sentimentTotals) <- NULL
ggplot(data = sentimentTotals, aes(x = sentiment, y = count)) +
  geom_bar(aes(fill = sentiment), stat = "identity") +
  theme(legend.position = "none") +
  xlab("Sentiment") + ylab("Total Count") +
  ggtitle("Total Sentiment Score for MAGA Tweets")

# % # % # % # % # % # % # % # % # % # % # % # % # % # % # % # % # %
# % # %               Quick activity break!                  # % # %
# 1. Use the functions above to process the United tweets.
#    (For example, the get_nrc_sentiment() function doesn't process
#     the seventh tweet: Unitedtweets[[7]] )
# 2. Visualize the sentiment of the United tweets.
# % # % # % # % # % # % # % # % # % # % # % # % # % # % # % # % # %
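## The per-tweet summing sketch promised above, using the AFINN lexicon on the
## tidy United tweets. One assumption to note: recent tidytext releases name
## the AFINN score column 'value' rather than 'score', so we standardize it first.
afinn <- get_sentiments("afinn")
names(afinn)[2] <- "score"   # column 2 is the word's score, whatever its name
tidy_Unitedtweets %>%
  inner_join(afinn, by = "word") %>%          # keep only words the lexicon scores
  group_by(id) %>%                            # one group per tweet (status id)
  summarise(tweet_sentiment = sum(score)) %>% # sum the word scores per tweet
  arrange(tweet_sentiment)                    # most negative tweets first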
###################################################################################################
## Addendum: relatively easy things we haven't covered:
## plotting geocoded tweets based on user location:
## https://bigdataenthusiast.wordpress.com/2016/04/17/visualization-of-tweets-on-google-maps/
## (a minimal sketch appears at the very end of this script)

################# An aside:
# We can also build a word cloud in R itself, with the wordcloud package:
install.packages("wordcloud")
library(wordcloud)
install.packages("tm")
library(tm)
nographicalchars <- str_replace_all(MAGA50tweets_df$text, "[^[:graph:]]", " ")
nographicalcharsCorpus <- Corpus(VectorSource(nographicalchars))
wordcloud(words = nographicalcharsCorpus, scale=c(3,0.5), max.words=40,
          random.order=FALSE, rot.per=0.10, use.r.layout=FALSE)
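## The minimal geocode-plotting sketch mentioned in the addendum above. It is a
## rough stand-in for the Google Maps approach at the link: twitteR writes
## longitude/latitude columns into the data frame, but most tweets carry no
## coordinates (with only 50 tweets, possibly none), so we keep the rows that
## do and plot them as plain points:
geotweets <- MAGA50tweets_df[!is.na(MAGA50tweets_df$longitude), ]
geotweets$longitude <- as.numeric(geotweets$longitude)
geotweets$latitude  <- as.numeric(geotweets$latitude)
ggplot(geotweets, aes(x = longitude, y = latitude)) +
  geom_point(color = "blue") +
  ggtitle("Geotagged #MAGA tweets")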