## Whitt Kilburn, Data Inquiry Lab
## Social media data analysis with R
## The first package we will use is twitteR:
## https://cran.r-project.org/web/packages/twitteR/README.html

##################### twitteR package installation #######################

install.packages("twitteR")
# use install.packages("twitteR", repos="http://cran.us.r-project.org") if you encounter a repository error
library(twitteR)

# Get your access credentials and enter them here:
consumer_key <- 'your key'
consumer_secret <- 'your secret'
access_token <- 'your access token'
access_secret <- 'your access secret'

## Then run the setup_twitter_oauth() function:
setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)
## You'll see a message about storing the access credentials in a cache file:
## [1] "Using direct authentication"
## Use a local file ('.httr-oauth'), to cache OAuth access credentials between R sessions?
## Choose 'Yes' or 'No'. Both work for our purposes.
## Once you set the authorization, you are ready to use functions in the twitteR package:

###################################################################################
###### Basic twitteR functions
###################################################################################

## The most basic function: search Twitter!
## searchTwitter() is the function.
## There are many search options, with Boolean and other operators.
## For the options, see https://dev.twitter.com/rest/public/search
## Two obvious examples: '#GVSU' or '@GVSU'

## Search Twitter for 50 recent tweets with #GVSU, storing them as GV50tweets:
GV50tweets <- searchTwitter('#GVSU', n=50)
head(GV50tweets) # first few tweets

## We can convert the tweets to a rectangular dataset (a data frame, GV50tweets_df)
## and save it as an external file:
GV50tweets_df <- do.call(rbind, lapply(GV50tweets, as.data.frame))
write.csv(GV50tweets_df, "GVSU50tweets.csv")

# % # % # % # % # % # % # % # % # % # % # % # % # % # % # % # % # %
# % # %                    Assignment!                       # % # %
# 1. Use the searchTwitter() function to find 100 recent tweets with
#    your own search terms. Store the tweets as the object
#    "mytwittersearch". Save the tweets as a CSV.
# 2. Open the CSV in Excel; edit it, and create a word cloud at
#    http://www.wordclouds.com/
# % # % # % # % # % # % # % # % # % # % # % # % # % # % # % # % # %

## Working with retweets:
## Do you want to get rid of retweets? Use the strip_retweets() function:
## strip_retweets(GV50tweets, strip_manual=TRUE, strip_mt=TRUE)
## By default, strip_retweets() removes only 'button' retweets -- those made with
## the retweet button rather than by typing 'RT' directly. The strip_manual=TRUE
## option above removes those manual retweets too.
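## A quick sketch of strip_retweets() in use, assuming the GV50tweets object
## created above; it returns a new (possibly shorter) list of tweets:
GV50noRT <- strip_retweets(GV50tweets, strip_manual=TRUE, strip_mt=TRUE)
length(GV50noRT)   # compare with length(GV50tweets)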
## Twitter limits your use of the API. A request for a user's timeline, for
## example, returns at most 3200 of their most recent tweets.
## To see when your limits will reset, run these two lines:
rate.limit <- getCurRateLimitInfo(c("lists"))
rate.limit

## Load info on a specific user, by their Twitter handle:
trump <- getUser('realDonaldTrump')
## This twitteR object can be inspected for various attributes:
trump$getFavorites(n=5)
## In the printout from the line below, notice the 'methods':
str(trump)
## We can use most of those 'methods':
trump$getDescription()
## Pulling a user's most recent 20 tweets:
trumptimeline <- userTimeline('realDonaldTrump', n=20)

######################### Twitter trends ##########################

# "Twitter keeps track of topics that are popular at any given point of time, and
# allows one to extract that data. The getTrends function is used to pull current
# trend information from a given location, which is specified using a WOEID (see
# http://developer.yahoo.com/geo/geoplanet/). Luckily there are two other
# functions to help you identify WOEIDs that you might be interested in. The
# availableTrendLocations function will return a data.frame with a location
# in each row and the woeid giving that location's WOEID."
avail_trends = availableTrendLocations()
head(avail_trends)
# the whole list of locations, about 500 places:
avail_trends

## Find trends in Detroit:
Detroittrends <- getTrends(2391585)
head(Detroittrends)
Detroittrends

## Or enter a geocoded location (Grand Rapids) for the closest approximation:
close_trends = closestTrendLocations(42.970848, -85.661656)
head(close_trends)
## In our case, not useful!

##################################### Twitter timelines ##############################################

## Other functions -- timelines to pull on yourself or other users:
# see https://www.rdocumentation.org/packages/twitteR/versions/1.1.9/topics/timelines
# userTimeline(user, n=20, maxID=NULL, sinceID=NULL, includeRts=FALSE, excludeReplies=FALSE, ...)
# homeTimeline(n=25, maxID=NULL, sinceID=NULL, ...)
# mentions(n=25, maxID=NULL, sinceID=NULL, ...)
# retweetsOfMe(n=25, maxID=NULL, sinceID=NULL, ...)

#############################################################################
######################### Text manipulation and tweet visualizations #######
#############################################################################

## United Airlines #leggingsgate: the United Airlines/girls-in-leggings controversy.
## Load a set of tweets I already collected:
load(file=url('http://faculty.gvsu.edu/kilburnw/Unitedsocialmedia.RData'))
## The workspace now holds a set of tweets already stored for us to work with.
## The commands used to pull the tweets are below:
# Unitedtweets <- searchTwitter('@United', n=3000)
## These @United tweets were requested at 13:51, Monday, March 27
head(Unitedtweets)
# Unitedhashtag_tweets <- searchTwitter('#unitedairlines', n=3000)
## These #unitedairlines tweets were requested at 16:01, Monday, March 27
head(Unitedhashtag_tweets)

## The tweets are stored as the datatype "list" in R.
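## Two quick base-R checks to confirm that:
class(Unitedtweets)    # "list"
length(Unitedtweets)   # how many tweets the search returned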
## Lists have numbered elements, accessible with double brackets, [[ ]];
## the numbers within the brackets correspond to individual tweets.
## On one tweet we can identify several attributes:
?status ## brings up the help page on Twitter status objects
onetweet <- Unitedtweets[[750]] ## the object 'onetweet' is the 750th tweet
## Start typing `onetweet$', and an RStudio tooltip shows the different characteristics:
onetweet$getScreenName()
onetweet$getText()
onetweet$getCreated()

# % # % # % # % # % # % # % # % # % # % # % # % # % # % # % # % # %
# % # %               Quick activity break!                  # % # %
# 1. Use the functions above to inspect the 1750th tweet!
# % # % # % # % # % # % # % # % # % # % # % # % # % # % # % # % # %

## For the text manipulation and visualization, we'll use a few additional packages:
install.packages("dplyr") # should already be installed on campus, I think
library(dplyr)
install.packages("purrr")
library(purrr)
# install.packages("ggplot2")
library(ggplot2)

## Convert the tweets into a data frame (rectangular dataset) with the tbl_df() function:
Unitedtweets_df <- tbl_df(map_df(Unitedtweets, as.data.frame))
# the different variables (columns) for the tweets:
names(Unitedtweets_df)
## Note: You could export these tweets to a CSV file, as before.
## We used a different set of commands to create the data frame, but the result is the same.

## Let's plot the most frequent words tweeted. We need to clean up the text first.
## NOTE: The text mining package 'tm' used to work well with tweets, but recently it has stopped working :(
## So we'll try a more recent text mining package, 'tidytext'. It's referenced at the end
## of the handout, and here's an overview:
## https://cran.r-project.org/web/packages/tidytext/vignettes/tidytext.html
install.packages("tidytext")
library(tidytext)
# We need one more package, stringr:
# install.packages("stringr")
library(stringr)

## The unnest_tokens() function will create a tidytext dataset.
## Rather than organizing text as strings, tidytext stores text in a structured format,
## with one row per token (a word or other unit of text, depending on the use).
## It automatically strips punctuation and converts text to lowercase.
## The downside is that it requires regular expressions to search for words.
## tidytext has a standard set of commands to filter out the t.co URLs, etc., and
## function (stop) words. You would run the following lines as a chunk, with your
## own data frame of tweets substituted for Unitedtweets_df:
replace_reg <- "https://t.co/[A-Za-z\\d]+|http://[A-Za-z\\d]+|&amp;|&lt;|&gt;|RT|https"
unnest_reg <- "([^A-Za-z_\\d#@']|'(?![A-Za-z_\\d#@]))"
tidy_Unitedtweets <- Unitedtweets_df %>%
  filter(!str_detect(text, "^RT")) %>%
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  unnest_tokens(word, text, token = "regex", pattern = unnest_reg) %>%
  filter(!word %in% stop_words$word,
         str_detect(word, "[a-z]"))

## The dataset `tidy_Unitedtweets' is our tidytext set of cleaned tweets:
tidy_Unitedtweets %>%
  count(word, sort = TRUE)

## We can now do some frequency plots:
tidy_Unitedtweets %>%
  count(word, sort = TRUE) %>%
  filter(n > 50) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()
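## Because the unnest_reg pattern above deliberately keeps the '#' and '@'
## prefixes on tokens, we can also rank the most-used hashtags -- a quick
## sketch using the same tidy_Unitedtweets data:
tidy_Unitedtweets %>%
  filter(str_detect(word, "^#")) %>%
  count(word, sort = TRUE) %>%
  head(10)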
## In the frequency plot above, @united is an outlier, of course. We can drop it
## from the plot by adding & n < 2000 to the filter:
tidy_Unitedtweets %>%
  count(word, sort = TRUE) %>%
  filter(n > 50 & n < 2000) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()

## Finally, with Twitter we'll attempt a lexicon-based sentiment analysis.
## The tidytext package incorporates three different sentiment lexicons.
## One is 'AFINN': http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6010
## Another, available in multiple languages, is 'NRC': http://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm
get_sentiments("afinn")
get_sentiments("nrc")

## The sentiment analysis matches words (word unigrams) to any scored words in the lexicon.
## For example, if someone used the term "abhor" in a tweet, it is scored a -3.
## A total sentiment score for a tweet is the sum of the scores of all scored words in
## the tweet (a short sketch of this per-tweet summing appears after the activity break below).
## Sentiment analysis is a huge subject, of course, and our approach has limitations:
## it won't understand "not good" as we do, or sarcasm!
## One of the authors of the tidytext package did a sentiment analysis of Trump;
## angrier tweets were sent from his Android phone:
## http://varianceexplained.org/r/trump-tweets/

## There's a similar approach with the syuzhet package:
install.packages("syuzhet")
library(syuzhet)
## Example from a Trump tweet!
get_nrc_sentiment("If the people of our great country could only see how viciously and inaccurately my administration is covered by certain media!")

## Search Twitter for 50 recent tweets with #MAGA:
MAGA50tweets <- searchTwitter('#MAGA', n=50)
head(MAGA50tweets) # first few tweets
## We create a data frame of the tweets:
MAGA50tweets_df <- do.call(rbind, lapply(MAGA50tweets, as.data.frame))
names(MAGA50tweets_df)

# Before processing, we need to remove the non-graphical characters (emoji and
# the like) from the tweets. We use regular expressions in R:
# http://stat.ethz.ch/R-manual/R-patched/library/base/html/regex.html
## Below, we rewrite the text in the dataset, replacing anything that matches
## "[^[:graph:]]" (i.e., any non-graphical character) with a blank space " ":
MAGA50tweets_df$text = str_replace_all(MAGA50tweets_df$text, "[^[:graph:]]", " ")

## Sentiment scores on the tweet text:
MAGAsentiment <- get_nrc_sentiment(MAGA50tweets_df$text)
## Merge with the Twitter dataset:
MAGA50tweets_df <- cbind(MAGA50tweets_df, MAGAsentiment)
names(MAGA50tweets_df)

## We total up the sentiment in the tweets for each emotion.
## (Columns 17 to 24 are the eight emotion columns appended by get_nrc_sentiment.
## We could also use the overall positive and negative columns; here we'll work
## with the individual emotions.)
sentimentTotals <- data.frame(colSums(MAGA50tweets_df[,c(17:24)]))
names(sentimentTotals) <- "count"
sentimentTotals <- cbind("sentiment" = rownames(sentimentTotals), sentimentTotals)
rownames(sentimentTotals) <- NULL
ggplot(data = sentimentTotals, aes(x = sentiment, y = count)) +
  geom_bar(aes(fill = sentiment), stat = "identity") +
  theme(legend.position = "none") +
  xlab("Sentiment") + ylab("Total Count") +
  ggtitle("Total Sentiment Score for MAGA Tweets")

# % # % # % # % # % # % # % # % # % # % # % # % # % # % # % # % # %
# % # %               Quick activity break!                  # % # %
# 1. Use the functions above to process the United tweets.
#    (For example, the get_nrc_sentiment() function doesn't process
#     the seventh tweet: Unitedtweets[[7]] )
# 2. Visualize the sentiment of the United tweets.
# % # % # % # % # % # % # % # % # % # % # % # % # % # % # % # % # %
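## The per-tweet summing sketch promised above, using the AFINN lexicon on the
## tidy United tweets. One assumption to note: recent tidytext releases name
## the AFINN score column 'value' rather than 'score', so we standardize it first.
afinn <- get_sentiments("afinn")
names(afinn)[2] <- "score"   # column 2 is the word's score, whatever its name
tidy_Unitedtweets %>%
  inner_join(afinn, by = "word") %>%          # keep only words the lexicon scores
  group_by(id) %>%                            # one group per tweet (status id)
  summarise(tweet_sentiment = sum(score)) %>% # sum the word scores per tweet
  arrange(tweet_sentiment)                    # most negative tweets first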
###################################################################################################
## Addendum: relatively easy things we haven't covered:
## plotting geocoded tweets based on user location:
## https://bigdataenthusiast.wordpress.com/2016/04/17/visualization-of-tweets-on-google-maps/
## (a minimal sketch appears at the very end of this script)

################# An aside:
# We can also build a word cloud in R itself, with the wordcloud package:
install.packages("wordcloud")
library(wordcloud)
install.packages("tm")
library(tm)
nographicalchars <- str_replace_all(MAGA50tweets_df$text, "[^[:graph:]]", " ")
nographicalcharsCorpus <- Corpus(VectorSource(nographicalchars))
wordcloud(words = nographicalcharsCorpus, scale=c(3,0.5), max.words=40,
          random.order=FALSE, rot.per=0.10, use.r.layout=FALSE)
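## The minimal geocode-plotting sketch mentioned in the addendum above. It is a
## rough stand-in for the Google Maps approach at the link: twitteR writes
## longitude/latitude columns into the data frame, but most tweets carry no
## coordinates (with only 50 tweets, possibly none), so we keep the rows that
## do and plot them as plain points:
geotweets <- MAGA50tweets_df[!is.na(MAGA50tweets_df$longitude), ]
geotweets$longitude <- as.numeric(geotweets$longitude)
geotweets$latitude  <- as.numeric(geotweets$latitude)
ggplot(geotweets, aes(x = longitude, y = latitude)) +
  geom_point(color = "blue") +
  ggtitle("Geotagged #MAGA tweets")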