#########################################################
# Script Name: WebinarScript.R (R examples for webinar)#
# Author: Dan Hall #
#########################################################
# Comments: remainder of line after hashtag is a comment
# The code we entered at the R prompt:
3*4
4:6
2*(4:6)
(1:3)*(4:6)
(1:2)*(4:6)
sqrt(9)
pnorm(1.96,lower.tail=FALSE)
################################
# More R code and an example : #
################################
N <- 17 # an assignment creating an R object called N containing 17
N # print out the value of N (a sample size for what follows)
quizScores <- c(8,10,6,6,8,10,10,6,10,8,10,4,10,6,10,8,2) #a vector of numbers
# could have computed N from the length of quizScores:
(N <- length(quizScores)) #() around an assignment prints the value
# results for the quiz:
(quizAvg <- mean(quizScores)); (quizSD <- sd(quizScores))
fivenum(quizScores) # gives min, Q1, median Q3, max of the data in quizScores
# now produce a box plot of the quiz scores:
boxplot(quizScores)
# add a title (submit "?title" for help on title function) :
title(main="Quiz Scores, Quiz 1", # commands can continue over multiple lines
ylab="Score (out of 10)", # and comments can come at the end of any line
sub="Mean=7.8, Median=8") # even within a function!
# In R, we can write our own functions. Suppose I wanted a function to give the
# intra-quartile range (distance between the 75th and 25th percentiles
# (Q3 and Q1). Here's a function that will do that:
iqr <- function(x){
fivenumx <- fivenum(x)
iqrange <- fivenumx[4]-fivenumx[2]
return(iqrange)
}
iqr(quizScores) # Test of our function
# In fact, there is a built-in R function called IQR() that is a more
# sophisticated version of what we just wrote.
IQR(quizScores)
# Now suppose there are undergrads and grad students in the class:
studType <- c(1,2,2,2,1,1,2,1,1,1,1,2,2,1,2,1,2) # 1=Undergrad, 2=Grad
# Do undergrads do better/worse on the quiz? Compute 2-sample t test:
N1 <- sum(2-studType) ; N2 <- N-N1;
ybar1 <- mean(quizScores[studType==1])
ybar2 <- mean(quizScores[studType==2])
var1 <- var(quizScores[studType==1])
var2 <- var(quizScores[studType==2])
varPooled <- ((N1-1)*var1+(N2-1)*var2)/(N-2)
( tstat <- (ybar1-ybar2)/sqrt( varPooled*(1/N1+1/N2) ) )
( pval <- 2*(1-pt(abs(tstat),df=N-2)) )
# Not significant evidence of a difference in mean performance on quiz.
# Isn't there an easier way? Yes, of course:
t.test(quizScores[studType==1],quizScores[studType==2],var.equal=TRUE)
# At present, the quiz scores and student types are in separate data vectors
# It is usually better to pull data from related variables together in a single
# "data frame". Such an object is the analog of a SAS data set.
quizData <- data.frame(quizScores, studType)
head(quizData,3) # The head() function shows the first few rows of a data frame
str(quizData) # the str() function shows the structure of a data frame
quizData$quizScores # dollar sign ($) accesses variables in a data frame
# quizScores and studType are both numeric variables. For some purposes it is
# useful to treat studType as a factor (analog of a variable on a SAS CLASS
# statement). So let's create a new version of studType that is a factor:
quizData$studTypeFac <- factor(quizData$studType,levels=1:2,labels=c("U","G"))
str(quizData)
# Now we can recompute t test using a different syntax:
t.test(quizScores ~ studTypeFac, data=quizData,var.equal=TRUE)
# Side-by-side boxplots of the quiz scores are a nice complement to our t test:
boxplot(quizScores ~ studTypeFac, data=quizData,
main="Quiz scores by student type",ylab="Quiz score (out of 10)",
xlab="Student type")
# Add a horizontal reference line (lty=3 means line type 3, which is dotted):
abline(h=6.5,lty=3,col=2) # col=2 is red (could use lty="dotted",col="red")
# and add a legend:
legend("bottomleft",lty=3,col=2,legend="P/F cut-off")
# Get a data frame from the MASS package for regression example:
data(Cars93,package="MASS") # cars reviewed in CR mag. in 1993
# Fit a simple linear regression of MPG on car weight
model1 <- lm(MPG.city~Weight,data=Cars93)
coef(model1) #extract & print estimated regression coefficients
confint(model1) # print 95% CIs for regression coefficients
summary(model1) # summarize the fitted model
# Recall the quizData data frame. Here is its summary:
summary(quizData)
# Data modes. First let's add some variables to quizData:
quizData$lastName <- c("Jones","Smith","Chen","Hanson","Butkus",
"Bradshaw","Rao","Matthews","Johnson",
"Deng","Lee","Davis","Wang","Nair",
"Samson","Ng","Harrison")
quizData$pass <- quizData$quizScores>=6.5
head(quizData,3)
# Some of the different modes in R:
mode(quizData$quizScores)
mode(quizData$pass)
mode(quizData$lastName)
mode(boxplot)
###################
# Reading In Data #
###################
### Use of read.table() ###
cigData <- read.table(
file="http://ww2.amstat.org/publications/jse/datasets/cigarettes.dat.txt",
header=FALSE,
col.names=c("brand","tar","nicotine","weight","CO"),
stringsAsFactors=FALSE) #Don't turn char vars into factors.
head(cigData)
##################
# Importing Data #
##################
# To import data from a file from another software package,
# we can use the foreign package. We must first install it
# (use Install packages... option in Tools menu) and then
# load it with the library function:
library(foreign) # must install it first
# There are several read.xxx() functions in the foreign package including
# read.dta() - reads Stata files
# read.spss() - reads SPSS files
# read.mtp() - reads Minitab portable worksheet files
# read.xport() - reads SAS Transport files
# Here is an example of how to import a Stata file using
# read.dta(). The data file is leinhard.dta. It contains
# data on infant mortality rates and per-capita incomes in
# various countries. A description of the data set can be
# found in leinhard.cbk.txt. Both files can be found at
# www.ats.ucla.edu/stat/stata/examples/ara/arastata_dl.htm
leinhard <- read.dta(file="http://www.ats.ucla.edu/stat/stata/examples/ara/leinhard.dta")
head(leinhard,3)
# And here's how you would import a permanent SAS data set
# (a .sas7bdat file) using read.sas7bdat() from the sas7bdat
# package:
library(sas7bdat) # must install it first
leinhard2 <- read.sas7bdat(file="./leinhard.sas7bdat")
head(leinhard2,3)
#######################
# Data Structures #
#######################
## Vectors ##
# Some vectors:
(petNames <- c("Heidi","Lexi","Purr","Princess")) # a character vector
( species <- c("dog","dog","cat","cat"))
( one2six <- 1:6 )
( fiveZeros <- rep(0,5) )
(sixRanUnis <- runif(6,0,1)) # 6 uniformly dist'd random numbers in (0,1)
( dummy.5 <- (sixRanUnis>0.5) ) # a logical vector
( emptyVec <- vector(mode="numeric",length=5)) # initialized to 0
# A vector has only one dimension: its length.
length(petNames)
length(sixRanUnis)
## Matrices ##
# Some matrices:
( petInfo <- cbind(petNames,species) ) # a character matrix
( M2x6 <- rbind(one2six,sixRanUnis))
( diag123 <- diag(c(1,2,3)) )
(seq2x3 <- matrix(1:6,nrow=2))
( one2sixMat <- t(as.matrix(one2six)) )
is.matrix(one2six); is.matrix(one2sixMat)
# Retrieving dimensions:
dim(one2sixMat)
dim(M2x6)
ncol(M2x6)
nrow(M2x6)
length(M2x6) # returns the total number of elements
#############
## Lists ##
#############
### Making Lists ###
# some lists:
( family.list <- list( members=c("Dad","Mom", "Johnny", "Suzy", "Fred"),
ages=c(42, 41, 11, 9, 5),
pets=c("Spot", "Rover", "Fuzz") ) ) # used names here
season.list <- list(c("Clemson","South Carolina","North Texas","LSU",
"Tennessee","Missouri","Vanderbilt","Florida",
"Appalachian State","Auburn","Kentucky","Georgia Tech",
"Nebraska"),
c(8,5),c("TaxSlayer.com Bowl") )
names(season.list) <- c("Opponents","Record","BowlGame")
season.list
### Accessing List Components ###
# A good reason to know about lists is that many functions return
# their results as a list. In addition, the summary function typically
# returns its results as a list. So even if you don't make a list yourself
# you need to know how to use a list (e.g., extract information from a list).
# E.g., consider the regression model we used before: a linear
# regression of MPG.city on Weight. That model was fit with the
# lm function, which returns the details of the fitted model
# as a list.
model1 <- lm(MPG.city~Weight,data=Cars93)
mode(model1)
# Most lists returned by functions are named lists, which means that each
# component in the list has a name. Therefore, we can see what is in the list
# by checking what names were given to the list components. This can be
# done with the names() function:
names(model1)
names(family.list)
# Some of these names suggest pretty clearly what is in the corresponding
# components of the list (e.g., residuals). Others are more cryptic (e.g.,
# assign)
# We can print out the whole list model1 if we want, but it is long.
# Instead we can look at particular components of the list in the same way
# that we look at components of a data frame: with the $ operator followed
# by the name of the component we are interested in:
model1$coefficients # the regression parameter estimates
# The summary function returns a list too. It takes a fitted regression object
# (the list we just looked at) and produces a second list that contains more
# information from the fitted model we might want to know.
model1.sum <- summary(model1)
names(model1.sum)
# From this list, we get other quantities we might be interested in. E.g.,
# The Rsquared value from the fitted model:
model1.sum$r.squared # same value printed by summary(model1)
####################################
# Subscripts and Logical Operators #
####################################
## Subscripts ##
# Let's define a named vector:
planetMasses <- c(Mercury=3.3022e23,Venus=4.8685e24,Earth=5.9736e24,
Mars=6.4185e23,Jupiter=1.8986e27,Saturn=5.6846e26,
Uranus=8.6810e25,Neptune=10.243e25)
# Lots of ways to subscript vectors:
planetMasses[1] # with a numeric index
planetMasses[c("Venus","Earth")] # with a name or vector of names
planetMasses[c(1,3)] # with a numeric index vector
planetMasses[-c(1,3)] # with a negative numeric index vector (omits values)
planetMasses[planetMasses>1e26] # with a logical vector
# Recall our data frame quizData:
head(quizData)
# Lots of ways to subscript matrices and data frames:
quizData[!quizData$pass,] # data from students who failed (! is "NOT" operator)
quizData[!quizData$pass & quizData$studType==1,] # data from undergrads who failed
quizData[!quizData$pass & quizData$studType==1,"lastName"] # names of undergrads who failed
## Logical Operators ##
# Above we used > and == (with the nchar() and subsrt() functions) to subscript
# petNames. Note that in these examples and in general, the vector being
# subscripted can appear in the subscript vector. Here are some other of
# logical operators and subscripting with them.
petNames[species=="cat"]; petNames[species!="dog"]
petNames[species=="cat" | species=="dog"]
petNames[species=="cat" & species=="dog"]
petNames[!(species=="cat" & species=="dog")]
# How element-wise comparisons work:
Y <- rep(c(NA,FALSE,TRUE),each=3); Z <- rep(c(NA,FALSE,TRUE),3)
(truthTable <- data.frame(Y=Y,Z=Z,YandZ=Y&Z,YorZ=Y|Z,YxorZ=xor(Y,Z)))
# How && and || work:
U <- c(TRUE,FALSE,TRUE,FALSE); V < c(TRUE,TRUE,FALSE,TRUE)
cbind(U,V)
U&&V # returns FALSE because U and V are not both TRUE for every element
U||V # RETURNS TRUE because at least one of U and V is TRUE for every element
#########################
## The plot() function ##
#########################
# The most basic use of plot() is to produce a scatter plot of y versus x.
# This can be done with any of the following syntaxes:
# plot(x,y)
# plot(y ~ x)
# plot(xyMat) where xyMat is a matrix with x in its first column, y in 2nd column
# plot(xyList) where xyList has components named "x" and "y"
par(mfrow=c(3,1)) # put plots in a 3 x 1 arrangement on the page
plot(Cars93$Weight,Cars93$MPG.city,main="Syntax 1")
plot(MPG.city ~ Weight,data=Cars93,main="Syntax 2") # Same result
plot(cbind(Cars93$Weight,Cars93$MPG.city),main="Syntax 3") # Same result
# Create variable domestic 1=domestic, 2=foreign
par(mfrow=c(1,1))# reset to 1 plot per page
domestic <- 2-as.numeric(as.character(Cars93$Origin)=="USA")
plot(MPG.city ~ Weight,data=Cars93,
main="Mileage vs Weight\n 1993 CR Cars Data",
ylab="MPG (city)",xlab="Weight (lbs.)",
col=domestic,pch=domestic)
abline(model1)
legend("topright",col=c(1,2,1),pch=c(1,2,NA),lty=c(NA,NA,1),
legend=c("Domestic","Foreign","LS Fit"),
bty="n") # no box around the legend
par(mfrow=c(2,2))
plot(model1,ask=FALSE) # generates model diagnostic plots
plot(MPG.city~Type+Price+EngineSize+MPG.highway,data=Cars93,ask=FALSE)
par(mfrow=c(1,1))
pairs(~MPG.city+Price+EngineSize+MPG.highway,data=Cars93)