Kodi Weatherholtz
kweatherholtz@ling.ohio-state.edu
“ggplot2 is a plotting system for R, based on the grammar of graphics, which tries to take the good parts of base and lattice graphics and none of the bad parts. It takes care of many of the fiddly details that make plotting a hassle (like drawing legends) as well as providing a powerful model of graphics that makes it easy to produce complex multi-layered graphics.”
Hadley Wickham, developer, http://ggplot2.org/
“ggplot2 is designed to work in a layered fashion, starting with a layer showing the raw data then adding layers of annotations and statistical summaries.”
Wickham, 2009, ggplot2: Elegant Graphics for Data Analysis
in base R:
– graphics are composed of simple raw elements (e.g., points, lines)
– it is difficult to add new components to existing graphics (e.g., error bars, legends)
ggplot:
– graphics involve high-level elements (representations of raw data, statistical transformations)
– elements are easily combined to form complex graphics
## even if you already have ggplot installed, you should reinstall it, as some
## function and argument names have changed with the release of version 9.2
# install.packages('ggplot2')
require(ggplot2)
Basic Examples
a data frame
a set of aesthetic mappings
– describe how variables in a data frame are mapped to graphical attributes
– x- and y-axis variables, colo(u)rs, subset groupings, linetypes
one or more geometric objects , called geoms
– determine how values are rendered graphically
– points, lines, boxplots, bars, etc.
statistical transformations, called stats
– e.g., regression lines, bootstrapped confidence intervals, log transform of scale and coordinate systems
one or more facets to create lattice/trellis plots
– e.g., plot each subject’s data individually
a coordinate system, or coord
– default is Cartesian coordinates, but other systems can be specified
– e.g., polar coords, log transformed coords, map projections (latitude and longitude)
scales
– e.g., adjust size of points in a scatterplot to be proportional on a specified scale
# mtcars is a built in data set.
head(mtcars)
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
# Let's tidy it up a bit
# turn rownames (car model) into vector, and select a subset of columns
df1 <- data.frame(model = as.factor(row.names(mtcars)),
mtcars[,c(11,1:2,4,6)])
row.names(df1) <- NULL
head(df1)
model carb mpg cyl hp wt
1 Mazda RX4 4 21.0 6 110 2.620
2 Mazda RX4 Wag 4 21.0 6 110 2.875
3 Datsun 710 1 22.8 4 93 2.320
4 Hornet 4 Drive 1 21.4 6 110 3.215
5 Hornet Sportabout 2 18.7 8 175 3.440
6 Valiant 1 18.1 6 105 3.460
ggplot(data = df1, aes(x = cyl, y = mpg, group = cyl)) +
geom_point()
ggplot(data = df1, aes(x = cyl, y = mpg, group = cyl)) +
geom_boxplot()
ggplot(data = df1, aes(x = cyl, y = mpg, group = cyl)) +
geom_violin()
# adjust formatting of x- and y- axes.
ggplot(data = df1, aes(x = cyl, y = mpg, group = cyl)) +
geom_violin() +
# change font size, font color, and axis label position
theme(axis.text = element_text(colour="black", size=15),
axis.title.x= element_text(face="bold", size=18, vjust=-0.45),
axis.title.y = element_text(size=18, vjust=.2),
axis.ticks = element_line(colour="red"))
my.theme <- theme(axis.text = element_text(colour="black", size=15),
text = element_text(size=16),
title = element_text(size=19),
axis.title.x= element_text(vjust=-0.45),
axis.title.y = element_text(vjust=.2),
axis.ticks = element_line(colour="black"),
axis.line = element_line())
ggplot(data = df1, aes(x = cyl, y = mpg, group = cyl)) +
geom_violin() +
geom_point() + # add a layer showing raw data points
my.theme # use the custom theme we just defined
ggplot(data = df1, aes(x = cyl, y = mpg, group = cyl)) +
geom_violin() +
geom_point(position=position_jitter(width=.1)) + # add jitter to points
my.theme
ggplot(data = df1, aes(x = cyl, y = mpg, group = cyl)) +
geom_violin() +
geom_dotplot(binaxis = "y", stackdir = "center") + # specify which axis to use for 'binning'
my.theme
ggplot(data = df1, aes(x = cyl, y = mpg, group = cyl)) +
geom_violin() +
geom_dotplot(binaxis = "y", stackdir = "center", dotsize = .65) + # change size of dots
my.theme
Basic Embellishments
by definition, boxplots display median and interquartile ranges, but not means
– ggplots are built in layers, so it’s easy to add a layer for descriptive statistics like means
scatterplots display values for multiple continuous variables
– it’s also easy to add a regression line to show direction and size of correlation
all plots can be annotated with text
ggplot(data = df1, aes(x = cyl, y = mpg, group = cyl)) +
geom_boxplot() +
# fun.y=mean calculates the mean of the y-axis variable (mpg) for each group.
# geom="point" indicates means should be plotted as points over top of the boxplots.
stat_summary(fun.y = mean, geom = "point", color = "red", size = 3) +
my.theme
ggplot(data = df1, aes(x = hp, y = wt)) +
geom_point() + # make scatterpot
geom_smooth() + # add loess line
scale_x_continuous("gross horsepower") + # change axis labels and limits
scale_y_continuous("weight (lb/1000)", limits = c(1,6)) +
my.theme
ggplot(data = df1, aes(x = hp, y = wt)) +
geom_point() +
geom_smooth(se = FALSE) + # remove error ribbon
scale_x_continuous("gross horsepower") +
scale_y_continuous("weight (lb/1000)", limits = c(1,6)) +
my.theme
ggplot(data = df1, aes(x = hp, y = wt)) +
geom_point() +
stat_smooth(geom = "pointrange") + # note the change to stat_smooth()
scale_x_continuous("gross horsepower") +
scale_y_continuous("weight (lb/1000)", limits = c(1,6)) +
my.theme
ggplot(data = df1, aes(x = hp, y = wt)) +
geom_point() +
geom_smooth(method = "lm", formula = y~(x)) +
scale_x_continuous("gross horsepower") +
scale_y_continuous("weight (lb/1000)", limits = c(1,6)) +
my.theme
ggplot(data = df1, aes(x = hp, y = wt)) +
geom_point() +
geom_smooth(method = "lm", formula = y~(x)) +
scale_x_continuous("gross horsepower") +
scale_y_continuous("weight (lb/1000)", limits = c(1,6)) +
# specify the text (i.e., label) and the x- and y-coordinates for text placement
annotate(geom = "text", x = 250, y = 2, label = "R2 = ???, p = ???", color = "red") +
my.theme
if your data set is continually changing (say, as you run more participants), you must continually hand correct the annotations
this is cumbersome, and it’s easy to make mistakes
SOLUTION: supply the label argument with output directly from regression model
let’s see an example
# Recall, we had previously specified geom_smooth(method='lm', formula=y~(x)).
# The call below produces the same model, just outside of ggplot.
my.model <- summary(lm(wt ~ hp, data = df1))
print(my.model)
Call:
lm(formula = wt ~ hp, data = df1)
Residuals:
Min 1Q Median 3Q Max
-1.4176 -0.5312 -0.0204 0.4254 1.5645
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.83825 0.31652 5.81 2.4e-06 ***
hp 0.00940 0.00196 4.80 4.1e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.748 on 30 degrees of freedom
Multiple R-squared: 0.434, Adjusted R-squared: 0.415
F-statistic: 23 on 1 and 30 DF, p-value: 4.15e-05
# extract the R-squared value for the weight~horsepower correlation
my.model.r2 = my.model$r.squared
print(my.model.r2)
[1] 0.4339
# combine R-squared value into a descriptive string
my.exp <- paste("R2 =", my.model.r2, sep = " ")
print(my.exp)
[1] "R2 = 0.433948779081183"
ggplot(data = df1, aes(x = hp, y = wt)) +
geom_point() +
geom_smooth(method = "lm", formula = y~(x)) +
scale_x_continuous("gross horsepower") +
scale_y_continuous("weight (lb/1000)", limits = c(1,6)) +
# We no longer specify the annotation text locally.
# The text comes directly from the regression model.
annotate(geom = "text", x = 250, y = 2, label = my.exp, color = "red")+
my.theme
lm_eqn = function(df1){
m = lm(wt ~ hp, df1);
eq <- substitute(italic(y) == a + b %.% italic(x)*","
~~italic(r)^2~"="~r2~~", p < "~~p,
list(a = format(coef(m)[1], digits = 2),
b = format(coef(m)[2], digits = 2),
r2 = format(summary(m)$r.squared, digits = 3),
p = format(coef(summary(m))[8], digits = 3, scientific=FALSE)
))
as.character(as.expression(eq));
}
Credit goes to Ramnath Vaidyanathan for this function. Go here for the original post on StackOverflow.
ggplot(data = df1, aes(x = hp, y = wt)) +
geom_point() +
geom_smooth(method = "lm", formula = y~(x)) +
scale_x_continuous("gross horsepower") +
scale_y_continuous("weight (lb/1000", limits = c(1,6)) +
annotate(geom = "text", x = 250, y = 2, label = lm_eqn(df1), parse = T, color = "red") +
my.theme
Grouping and Statistically Transforming Data
df2 <- subset(diamonds, carat > 0.5)
head(df2)
carat cut color clarity depth table price x y z
91 0.70 Ideal E SI1 62.5 57 2757 5.70 5.72 3.57
92 0.86 Fair E SI2 55.1 69 2757 6.45 6.33 3.52
93 0.70 Ideal G VS2 61.6 56 2757 5.70 5.67 3.50
94 0.71 Very Good E VS2 62.4 57 2759 5.68 5.73 3.56
95 0.78 Very Good G SI2 63.8 56 2759 5.81 5.85 3.72
96 0.70 Good E VS2 57.5 58 2759 5.85 5.90 3.38
nrow(df2)
[1] 35008
ggplot(data = df2, aes(x = carat, y = price)) +
geom_point() +
geom_smooth(method = "lm") +
scale_x_continuous("carat") +
scale_y_continuous("price")+
my.theme
ggplot(data = df2, aes(x = log10(carat), y = log10(price))) +
geom_point() +
geom_smooth(method = "lm") +
scale_x_continuous("log10(carat)") +
scale_y_continuous("log10(price)") +
my.theme
ggplot(data = df2, aes(x = carat, y = price)) +
geom_point() +
geom_smooth(method = "lm") +
coord_trans(x = "log10", y = "log10") +
scale_x_continuous("log10(carat)") +
scale_y_continuous("log10(price)") +
my.theme
ggplot(data=df2, aes(x=carat, y=price, colour=cut, fill=cut)) +
geom_point() +
geom_smooth(method = "lm") +
coord_trans(x = "log10", y = "log10") +
scale_x_continuous("log10(carat)") +
scale_y_continuous("log10(price)") +
my.theme
ggplot(data=df2, aes(x=carat, y=price, colour=cut, fill=cut)) +
geom_point(alpha = .05) + # number btw 0 (transparent) and 1 (opaque)
geom_smooth(method = "lm") +
coord_trans(x = "log10", y = "log10") +
scale_x_continuous("log10(carat)") +
scale_y_continuous("log10(price)") +
my.theme
ggplot(data=df2, aes(x=carat, y=price, colour=cut, fill=cut)) +
geom_point(alpha = .05) +
geom_smooth(method = "lm") +
coord_trans(x = "log10", y = "log10") +
scale_x_continuous("log10(carat)") +
scale_y_continuous("log10(price)") +
guides(fill = guide_legend(reverse = TRUE), colour = guide_legend(reverse = TRUE)) +
my.theme
# plot number of tokens (counts) that fall within a specific bin
ggplot(data=df2, aes(x=price, fill=cut, colour=cut)) +
geom_histogram(alpha=.8) +
guides(fill = guide_legend(reverse = TRUE), colour = guide_legend(reverse = TRUE)) +
my.theme
# plot the probability that a variable will take on a given value
ggplot(data=df2, aes(x=price, fill=cut, colour=cut)) +
geom_density(alpha=.5) +
guides(fill = guide_legend(reverse = TRUE), colour = guide_legend(reverse = TRUE)) +
my.theme
# the trick here is to change the y-axis for the histrogram
# from counts to probability density
ggplot(data=df2, aes(x=price)) +
geom_histogram(aes(y=..density..), alpha=.5) +
geom_density(size=1.25) +
my.theme
ggplot(data=df2, aes(x=price)) +
geom_histogram(aes(y=..density..), alpha=.5) +
geom_density(size=1.25, adjust=1.9) +
my.theme
ggplot(data=df2, aes(x=price, fill=..count..)) +
geom_histogram() +
scale_fill_continuous(low="violet", high="violetred4") +
my.theme
# define subset for easy reference
df2.sub <- subset(df2, cut %in% c("Ideal","Good") & price < 8000)
ggplot(data=df2.sub, aes(x=price, colour=cut)) +
geom_histogram(aes(y=..density.., fill=cut), alpha=.3) +
geom_density(size=1.5) +
facet_grid(cut ~ .) +
my.theme
# calculate means of each group
df2.sub.means <- ddply(df2.sub, .(cut),
summarise, mean.price=mean(price))
Error: could not find function "ddply"
print(df2.sub.means)
Error: object 'df2.sub.means' not found
ggplot(data=df2.sub, aes(x=price, colour=cut)) +
geom_histogram(aes(y=..density.., fill=cut), alpha=.3) +
geom_density(size=1.5) +
geom_vline(data=df2.sub.means, aes(xintercept=mean.price),
linetype="dashed", size=1, colour="red") + # add means as dashed lines
facet_grid(cut ~ .) + # plot subgroups as separate "facets"
my.theme
Error: object 'df2.sub.means' not found
Some Cosmetic Details
ggplot(mtcars, aes(x = factor(cyl), fill = factor(cyl))) +
geom_bar() +
my.theme
ggplot(mpg, aes(x = trans, fill = trans)) +
geom_bar() +
my.theme
ggplot(mtcars, aes(x = factor(cyl), fill = factor(cyl))) +
geom_bar() +
scale_fill_manual(values=c("red", "green", "blue")) +
my.theme
ggplot(mtcars, aes(x = factor(cyl), fill = factor(cyl))) +
geom_bar() +
scale_fill_manual(values=c("#980000", "#009800", "#000098")) +
my.theme
ggplot(mtcars, aes(x = factor(cyl), fill = factor(cyl))) +
geom_bar() +
scale_fill_brewer() +
my.theme
ggplot(mtcars, aes(x = factor(cyl), fill = factor(cyl))) +
geom_bar() +
scale_fill_brewer(palette="YlOrRd")+
my.theme
ggplot(mtcars, aes(x = factor(cyl), fill = factor(cyl))) +
geom_bar() +
scale_fill_grey() +
my.theme
ggplot(mtcars, aes(x = factor(cyl), fill = factor(cyl))) +
geom_bar() +
scale_fill_grey("Cylinders") +
my.theme
ggplot(mtcars, aes(x = factor(cyl), fill = factor(cyl))) +
geom_bar() +
scale_fill_grey("Cylinders") +
theme(legend.position="bottom", # move legend below x-axis
legend.title = element_text(colour="red", face="italic", size=19),
legend.text = element_text(size=17),
axis.text.x = element_text(colour="red", face="bold", size=17),
axis.title.x = element_blank(), # remove x-axis label
panel.background = element_blank(), # remove panel grid
axis.line = element_line() # add 'l-shaped' axis line
)
Linguistic Data
Part 1: Formant Values and Vowel Plots
# acoustic measurements of 8 reps of 10 vowels from 72 speakers (adults and
# children)
pb <- read.delim("http://ling.upenn.edu/courses/cogs501/pb.Table1", header = FALSE)
# Give meaningful column names
colnames(pb) <- c("Age", "Sex", "Speaker", "Vowel", "V1", "F0.hz", "F1.hz", "F2.hz",
"F3.hz")
# Relabel the age data
levels(pb$Age) <- c("Child", "Adult", "Adult")
pb$Age <- relevel(pb$Age, "Adult")
# Clean up speaker labeling
pb$Speaker <- as.factor(paste("S", pb$Speaker, sep = ""))
head(pb)
Age Sex Speaker Vowel V1 F0.hz F1.hz F2.hz F3.hz
1 Adult m S1 iy i 160 240 2280 2850
2 Adult m S1 iy i 186 280 2400 2790
3 Adult m S1 ih \\ic 203 390 2030 2640
4 Adult m S1 ih \\ic 192 310 1980 2550
5 Adult m S1 eh \\ep 161 490 1870 2420
6 Adult m S1 eh \\ep 155 570 1700 2600
# note we can select a subset from within the call to ggplot
ggplot(data=subset(pb, Speaker=="S2"), aes(x=F2.hz, y=F1.hz, label=Vowel)) +
geom_point() +
geom_text() + # geom_text() labels points. Label must be specified as an aesthetic
scale_y_reverse() + # reverse the axes
scale_x_reverse() +
my.theme
# note we can select a subset from within the call to ggplot
ggplot(data=subset(pb, Speaker=="S2"), aes(x=F2.hz, y=F1.hz, label=Vowel)) +
geom_point() +
geom_text(hjust=1.1, vjust=-.2, size=5.5) + # adjust placement and size of labels
scale_y_reverse() +
scale_x_reverse() +
my.theme
ggplot(data=subset(pb, Speaker %in% c("S1","S2","S3","S4")), aes(x=F2.hz, y=F1.hz, label=Vowel)) +
geom_text(hjust=1, vjust=-.2) +
scale_y_reverse() +
scale_x_reverse() +
facet_wrap(~ Speaker) + # this line provides functionality similar to the lattice package
my.theme
ggplot(data=pb, aes(x=F2.hz, y=F1.hz, label=Vowel, colour=Vowel)) +
geom_text(hjust=1, vjust=-.2) +
scale_y_reverse() +
scale_x_reverse() +
facet_wrap(~ Sex + Age) + # specify multiple variables for faceting
my.theme
Note: some male-adult values are being cut off
ggplot(data=pb, aes(x=F2.hz, y=F1.hz, label=Vowel, colour=Vowel)) +
geom_text(hjust=1, vjust=-.2) +
scale_y_reverse() +
scale_x_reverse() +
coord_cartesian(ylim=c(0, 1500)) +
facet_wrap(~ Sex + Age) +
my.theme
There are many ways to adjust axis limits, but when axes are reversed, it’s best to use coord_cartesian()
ggplot(data=subset(pb, Sex=="f"), aes(x=F2.hz, y=F1.hz, label=Vowel, colour=Age)) +
geom_text() +
scale_y_reverse() +
scale_x_reverse() +
my.theme
… we could plot shaded regions showing the total area of the child’s formant space vs. the adult’s space?
as it turns out, geom_polygon can do just that
first, we just need to find the envelope (i.e., convex hull) for adults and children
require(plyr)
# chull() is a function for finding the 'envelope' of a set of data points.
# Here we use this function to find the extreme points for a 2-dimensional
# space (F1-F2)
find_hull <- function(df) {
df[chull(df$F2.hz, df$F1.hz), ]
}
# apply this function to find the envelope for females by Age (adult vs.
# children)
hulls <- ddply(subset(pb, Sex == "f"), .(Age), find_hull)
print(hulls)
Age Sex Speaker Vowel V1 F0.hz F1.hz F2.hz F3.hz
1 Adult f S54 iy i 220 220 2850 3800
2 Adult f S37 ah \\vt 213 280 850 2500
3 Adult f S61 ah \\vt 260 290 670 2380
4 Adult f S61 ah \\vt 275 330 630 2460
5 Adult f S59 uh \\hs 246 420 590 3100
6 Adult f S58 ao \\ct 250 950 1130 3160
7 Adult f S53 ao \\ct 267 987 1172 3180
8 Adult f S40 ao \\ct 180 1040 1300 3000
9 Adult f S48 ae \\ae 222 1110 2160 2700
10 Adult f S40 iy i 200 300 3100 3400
11 Child f S63 iy i 290 320 3500 4260
12 Child f S68 ah \\vt 260 310 730 3500
13 Child f S68 ah \\vt 274 360 660 3050
14 Child f S62 ah \\vt 200 400 650 3800
15 Child f S66 uh \\hs 285 770 940 3750
16 Child f S76 ao \\ct 350 1190 1470 3150
17 Child f S62 ao \\ct 200 1300 1800 3450
18 Child f S67 ae \\ae 214 1240 2700 3640
19 Child f S67 iy i 227 590 3610 4220
ggplot(data=subset(pb, Sex=="f"), aes(x=F2.hz, y=F1.hz, label=Vowel, colour=Age, fill=Age)) +
geom_text() +
geom_polygon(data=hulls, alpha=.4) +
scale_y_reverse() +
scale_x_reverse() +
my.theme
Linguistic Data
Part 2: Response Times, Barplots, and Error Bars
standard error of the mean equals std. deviation (\(\sigma\)) divided by the square root of the sample size (N)
\[ SE_{\bar{x}} = \sigma / \sqrt{N} \]
which is equivalent to square root of variance (\(s^2\)) divided by square root of sample size (N)
\[ \sqrt{s^2 / N} \]
std.error <- function(x, na.rm = T) {
sqrt(var(x, na.rm = na.rm)/length(x[complete.cases(x)]))
}
# read in fake data
rt.dat <- read.delim("datasets/fakedata.txt", header = T)
head(rt.dat)
Subject Item IV1 IV2 Response Freq RT
1 S1 t_01 A 2 1 0.24 1100
2 S1 t_02 A 2 1 5.22 983
3 S1 t_03 A 2 1 5.10 1108
4 S1 t_04 A 2 1 5.77 984
5 S1 t_05 A 2 1 2.37 1340
6 S1 t_06 A 2 1 5.80 1192
# How many data points per subjet per condition
table(rt.dat$IV1, rt.dat$IV2)/length(unique(rt.dat$Subject))
2 3 4
A 8 8 8
B 8 8 8
require(plyr)
# use ddply() to calculate condition means for each subject, averaging over items
mean.bySubj <- ddply(rt.dat, .(Subject, IV1, IV2), summarise,
"mean.RT" = mean(RT))
head(mean.bySubj, 18)
Subject IV1 IV2 mean.RT
1 S1 A 2 1150
2 S1 A 3 1309
3 S1 A 4 1453
4 S1 B 2 1067
5 S1 B 3 1479
6 S1 B 4 1694
7 S10 A 2 1245
8 S10 A 3 1318
9 S10 A 4 1636
10 S10 B 2 1136
11 S10 B 3 1467
12 S10 B 4 1914
13 S11 A 2 1234
14 S11 A 3 1418
15 S11 A 4 1679
16 S11 B 2 1102
17 S11 B 3 1435
18 S11 B 4 1894
grand.means <- ddply(mean.bySubj, .(IV1, IV2), summarise,
"grand.RT" = mean(mean.RT),
"se" = std.error(mean.RT))
print(grand.means)
IV1 IV2 grand.RT se
1 A 2 713.9 76.75
2 A 3 885.9 76.33
3 A 4 1152.3 77.98
4 B 2 721.9 74.06
5 B 3 1019.1 81.86
6 B 4 1445.9 77.44
ggplot(data = grand.means, aes(x = factor(IV2), y = grand.RT, fill = IV1)) +
geom_bar(position = position_dodge(.9)) + # position bars side-by-side (cf. stacked)
geom_errorbar(position = position_dodge(.9), width = .25,
aes(ymin = grand.RT-se, ymax = grand.RT+se)) + # ymax and ymin are group mean +/- std. error
my.theme
ggplot(data = grand.means, aes(x = factor(IV2), y=grand.RT, group=IV1, colour=IV1)) +
geom_point() + # use points and lines instead of geom_bar()
geom_line() +
# note: we no longer need to specify position_dodge for the errorbars
geom_errorbar(width = .15, aes(ymin = grand.RT-se, ymax = grand.RT+se)) +
my.theme
ggplot(data = grand.means, aes(x = factor(IV2), y=grand.RT, group=IV1, colour=IV1)) +
geom_point() +
geom_line(linetype="dashed") +
geom_errorbar(width = .15, aes(ymin = grand.RT-se, ymax = grand.RT+se)) +
my.theme
# immediately calculate condition means and error from raw data,
# without first averaging over subjs
means.wrong <- ddply(rt.dat, .(IV1, IV2), summarise, "mean.RT" = mean(RT))
error.wrong <- ddply(rt.dat, .(IV1,IV2), summarise, "se" = std.error(RT))
# merge means and error into one frame
wrong.grand <- merge(means.wrong, error.wrong, by=c("IV1", "IV2"), all=T)
print(wrong.grand)
IV1 IV2 mean.RT se
1 A 2 713.9 28.56
2 A 3 885.9 29.06
3 A 4 1152.3 30.37
4 B 2 721.9 27.55
5 B 3 1019.1 31.51
6 B 4 1445.9 32.02
The error bars became substantially smaller. Why?
Recall that when calculating standard error of the mean correctly, the denominator is the sample size of the data set (e.g., number of participants)
Hence, we want to calculate error using a data frame containing one data point per subject per condition (i.e., each subject’s mean performance in that condition)
If we calculate standard error over the whole data set (as in the second example above), instead of over subject means, then the denominator is huge! Which leads to a tiny error measurement.
Linguistic Data
Part 3: Binomial DVs and Proportions
## lexdec is a canned lexical decision data set from Harold Baayen and Jen Hay
## available in the package languageR
# install.packages('languageR')
require(languageR)
# numeric coding of of whether response was correct or incorrect
lexdec$Correct.num <- ifelse(lexdec$Correct == "correct", 1, 0)
# simplify column names
colnames(lexdec)[c(5, 10)] <- c("NativeLang", "Freq")
# select specific subset for illustrative purposes
subjs <- c("A1", "A2", "C", "K", "R3", "S", "T1", "J", "M2", "T2", "V", "Z")
lex.sub <- subset(lexdec, Subject %in% subjs)
str(lex.sub)
'data.frame': 948 obs. of 29 variables:
$ Subject : Factor w/ 21 levels "A1","A2","A3",..: 1 1 1 1 1 1 1 1 1 1 ...
$ RT : num 6.34 6.31 6.35 6.19 6.03 ...
$ Trial : int 23 27 29 30 32 33 34 38 41 42 ...
$ Sex : Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 1 ...
$ NativeLang : Factor w/ 2 levels "English","Other": 1 1 1 1 1 1 1 1 1 1 ...
$ Correct : Factor w/ 2 levels "correct","incorrect": 1 1 1 1 1 1 1 1 1 1 ...
$ PrevType : Factor w/ 2 levels "nonword","word": 2 1 1 2 1 2 2 1 1 2 ...
$ PrevCorrect : Factor w/ 2 levels "correct","incorrect": 1 1 1 1 1 1 1 1 1 1 ...
$ Word : Factor w/ 79 levels "almond","ant",..: 55 47 20 58 25 12 71 69 62 1 ...
$ Freq : num 4.86 4.61 5 4.73 7.67 ...
$ FamilySize : num 1.386 1.099 0.693 0 3.135 ...
$ SynsetCount : num 0.693 1.946 1.609 1.099 2.079 ...
$ Length : int 3 4 6 4 3 10 10 8 6 6 ...
$ Class : Factor w/ 2 levels "animal","plant": 1 1 2 2 1 2 2 1 2 2 ...
$ FreqSingular: int 54 69 83 44 1233 26 50 63 11 24 ...
$ FreqPlural : int 74 30 49 68 828 31 65 47 9 42 ...
$ DerivEntropy: num 0.791 0.697 0.475 0 1.213 ...
$ Complex : Factor w/ 2 levels "complex","simplex": 2 2 2 2 2 1 1 2 2 2 ...
$ rInfl : num -0.31 0.815 0.519 -0.427 0.398 ...
$ meanRT : num 6.36 6.42 6.34 6.34 6.3 ...
$ SubjFreq : num 3.12 2.4 3.88 4.52 6.04 3.28 5.04 2.8 3.12 3.72 ...
$ meanSize : num 3.48 3 1.63 1.99 4.64 ...
$ meanWeight : num 3.18 2.61 1.21 1.61 4.52 ...
$ BNCw : num 12.06 5.74 5.72 2.05 74.84 ...
$ BNCc : num 0 4.06 3.25 1.46 50.86 ...
$ BNCd : num 6.18 2.85 12.59 7.36 241.56 ...
$ BNCcRatio : num 0 0.708 0.568 0.713 0.68 ...
$ BNCdRatio : num 0.512 0.497 2.202 3.591 3.228 ...
$ Correct.num : num 1 1 1 1 1 1 1 1 1 1 ...
ggplot(data = lex.sub, aes(x=Freq, y=Correct.num, colour=NativeLang, fill=NativeLang)) +
geom_point() +
# fit data with a logistic regression line
geom_smooth(method="glm", family="binomial", formula=y~(x)) +
scale_y_continuous("Proportion correct") +
scale_x_continuous("Word Frequency") +
my.theme
ggplot(data = lex.sub, aes(x=Freq, y=Correct.num, colour=NativeLang, fill=NativeLang)) +
geom_point() +
geom_smooth() +
scale_y_continuous("Proportion correct") +
scale_x_continuous("Word Frequency") +
my.theme
# calculate proportion correct for each subject as a function of whether the
# previous type was a word or nonword
props.bySubj <- ddply(lex.sub, .(Subject, NativeLang, PrevType), function(x) {
sum(x$Correct.num)/nrow(x)
})
# change name of the proportion columns
names(props.bySubj)[names(props.bySubj) == "V1"] <- "Prop"
print(props.bySubj)
Subject NativeLang PrevType Prop
1 A1 English nonword 0.9767
2 A1 English word 1.0000
3 A2 English nonword 0.9487
4 A2 English word 0.9750
5 C English nonword 1.0000
6 C English word 0.9722
7 J Other nonword 0.8667
8 J Other word 0.8235
9 K English nonword 1.0000
10 K English word 0.8537
11 M2 Other nonword 0.9444
12 M2 Other word 0.9535
13 R3 English nonword 0.9750
14 R3 English word 0.9744
15 S English nonword 0.9750
16 S English word 1.0000
17 T1 English nonword 0.9487
18 T1 English word 0.9750
19 T2 Other nonword 0.9048
20 T2 Other word 0.9189
21 V Other nonword 0.9773
22 V Other word 0.9429
23 Z Other nonword 0.9111
24 Z Other word 0.9118
ggplot(data = props.bySubj, aes(x = PrevType, y = Prop, colour = NativeLang)) +
stat_summary(fun.data = "mean_cl_boot", size = 1.4) +
my.theme
ggplot(data = props.bySubj, aes(x = PrevType, y = Prop, fill = NativeLang)) +
stat_summary(position=position_dodge(.9), fun.y=mean, geom="bar") +
stat_summary(position=position_dodge(.9), fun.data = "mean_cl_boot", geom="errorbar", width=.25, colour="black") +
my.theme
# calculate grand mean and standard error by native language and previous word
# type
grand.props <- ddply(props.bySubj, .(NativeLang, PrevType), summarise, mean.prop = mean(Prop),
se = std.error(Prop))
print(grand.props)
NativeLang PrevType mean.prop se
1 English nonword 0.9749 0.00792
2 English word 0.9643 0.01901
3 Other nonword 0.9209 0.01875
4 Other word 0.9101 0.02295
ggplot(data = grand.props, aes(x = PrevType, y = mean.prop, fill = NativeLang)) +
geom_bar(position = position_dodge(.9)) +
geom_errorbar(position = position_dodge(.9), width = .25,
aes(ymin = mean.prop-se, ymax = mean.prop+se)) +
my.theme
Linguistic Data
Part 4: Geospatial Plotting and Possibilities for Sociolinguistics and Dialectology
# install.packages("maps")
require(maps)
# collect longitude and latitude coordinates for US states
states <- map_data("state")
head(states)
long lat group order region subregion
1 -87.46 30.39 1 1 alabama <NA>
2 -87.48 30.37 1 2 alabama <NA>
3 -87.53 30.37 1 3 alabama <NA>
4 -87.53 30.33 1 4 alabama <NA>
5 -87.57 30.33 1 5 alabama <NA>
6 -87.59 30.33 1 6 alabama <NA>
ggplot(data=states, aes(map_id = region, group="group")) +
geom_map(map = states) +
expand_limits(x = states$long, y = states$lat)
ggplot(data=states, aes(map_id = region, group="group")) +
# colour affects border, fill affects state regions
geom_map(color="#757575", fill="lightgray", map = states) +
expand_limits(x = states$long, y = states$lat) +
coord_map()
geo.dat <- read.delim("datasets/fakedata_geo.txt", header=T)
head(geo.dat)
lat long region group
1 41.6 -80.9 ohio B
2 42.1 -83.9 ohio D
3 39.7 -84.2 ohio A
4 40.6 -82.4 ohio B
5 39.7 -82.3 ohio D
6 41.9 -82.5 ohio A
str(geo.dat)
'data.frame': 323 obs. of 4 variables:
$ lat : num 41.6 42.1 39.7 40.6 39.7 41.9 41.4 38.6 42.3 39.5 ...
$ long : num -80.9 -83.9 -84.2 -82.4 -82.3 -82.5 -81.7 -84.1 -82.7 -82.2 ...
$ region: Factor w/ 10 levels "georgia","idaho",..: 6 6 6 6 6 6 6 6 6 6 ...
$ group : Factor w/ 5 levels "A","B","C","D",..: 2 4 1 2 4 1 1 1 1 2 ...
ggplot(data=states, aes(map_id = region, group="group")) +
geom_map(color="#757575", fill="lightgray", map = states) +
expand_limits(x = states$long, y = states$lat) +
coord_map() +
geom_point(data = geo.dat, aes(x=long, y=lat, colour=group)) # HERE'S THE MAGIC!
ggplot(data=states, aes(map_id = region, group="group")) +
geom_map(color="#757575", fill="lightgray", map = states) +
expand_limits(x = states$long, y = states$lat) +
coord_map() +
geom_point(data = geo.dat, aes(x=long, y=lat, colour=group)) +
theme(panel.background = element_blank(), # remove panel background
axis.ticks = element_blank(),
axis.text = element_blank(),
axis.title = element_blank())
Thanks to: