summaryrefslogtreecommitdiffhomepage
path: root/r.html.markdown
blob: 2cf632886d629b2d95c11405688855295ebdaa99 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
---
language: R
author: e99n09
author_url: http://github.com/e99n09
filename: learnr.r
---

R is a statistical computing language. It has lots of good built-in functions for uploading and cleaning data sets, running common statistical tests, and making graphs. You can also easily compile it within a LaTeX document.

```python

# Comments start with hashtags.

# You can't make a multi-line comment per se,
# but you can stack multiple comments like so.

# Hit COMMAND-ENTER to execute a line

#########################
# The absolute basics
#########################

# NUMBERS

# We've got doubles! Behold the "numeric" class
5 # => [1] 5
class(5) # => [1] "numeric"
# We've also got integers! They look suspiciously similar,
# but indeed are different
5L # => [1] 5
class(5L) # => [1] "integer"
# Try ?class for more information on the class() function
# In fact, you can look up the documentation on just about anything with ?

# All the normal operations!
10 + 66 # => [1] 76
53.2 - 4 # => [1] 49.2
2 * 2.0 # => [1] 4
3L / 4 # => [1] 0.75
3 %% 2 # => [1] 1

# Finally, we've got not-a-numbers! They're numerics too
class(NaN) # => [1] "numeric"

# CHARACTERS

# We've (sort of) got strings! Behold the "character" class
"plugh" # => [1] "plugh"
class("plugh") # "character"
# There's no difference between strings and characters in R

# LOGICALS

# We've got booleans! Behold the "logical" class
class(TRUE) # => [1] "logical"
class(FALSE) # => [1] "logical"
# Behavior is normal
TRUE == TRUE # => [1] TRUE
TRUE == FALSE # => [1] FALSE
FALSE != FALSE # => [1] FALSE
FALSE != TRUE # => [1] TRUE
# Missing data (NA) is logical, too
class(NA) # => [1] "logical"

# FACTORS

# The factor class is for categorical data
# It has an attribute called levels that describes all the possible categories
factor("dog")
# =>
# [1] dog
# Levels: dog
# (This will make more sense once we start talking about vectors)

# VARIABLES

# Lots of way to assign stuff
x = 5 # this is possible
y <- "1" # this is preferred
TRUE -> z # this works but is weird

# We can use coerce variables to different classes
as.numeric(y) # => [1] 1
as.character(x) # => [1] "5"

# LOOPS

# We've got for loops
for (i in 1:4) {
  print(i)
}

# We've got while loops
a <- 10
while (a > 4) {
	cat(a, "...", sep = "")
	a <- a - 1
}

# Keep in mind that for and while loops run slowly in R
# Operations on entire vectors (i.e. a whole row, a whole column)
# or apply()-type functions (we'll discuss later) are preferred

# IF/ELSE

# Again, pretty standard
if (4 > 3) {
	print("Huzzah! It worked!")
} else {
	print("Noooo! This is blatantly illogical!")
}
# =>
# [1] "Huzzah! It worked!"

# FUNCTIONS

# Defined like so:
myFunc <- function(x) {
	x <- x * 4
	x <- x - 1
	return(x)
}

# Called like any other R function:
myFunc(5) # => [1] 19

#########################
# Fun with data: vectors, matrices, data frames, and arrays
#########################

# ONE-DIMENSIONAL

# You can vectorize anything, so long as all components have the same type
vec <- c(8, 9, 10, 11)
vec # => [1]  8  9 10 11
# The class of a vector is the class of its components
class(vec) # => [1] "numeric"
# If you vectorize items of different classes, weird coercions happen
c(TRUE, 4) # => [1] 1 4
c("dog", TRUE, 4) # => [1] "dog"  "TRUE" "4"

# We ask for specific components like so (R starts counting from 1)
vec[1] # => [1] 8
# We can also search for the indices of specific components,
which(vec %% 2 == 0) # => [1] 1 3
# or grab just the first or last entry in the vector
head(vec, 1) # => [1] 8
tail(vec, 1) # => [1] 11
# If an index "goes over" you'll get NA:
vec[6] # => [1] NA
# You can find the length of your vector with length()
length(vec) # => [1] 4

# You can perform operations on entire vectors or subsets of vectors
vec * 4 # => [1] 16 20 24 28
vec[2:3] * 5 # => [1] 25 30
# and there are many built-in functions to summarize vectors
mean(vec) # => [1] 9.5
var(vec) # => [1] 1.666667
sd(vec) # => [1] 1.290994
max(vec) # => [1] 11
min(vec) # => [1] 8
sum(vec) # => [1] 38

# TWO-DIMENSIONAL (ALL ONE CLASS)

# You can make a matrix out of entries all of the same type like so:
mat <- matrix(nrow = 3, ncol = 2, c(1,2,3,4,5,6))
mat
# =>
#      [,1] [,2]
# [1,]    1    4
# [2,]    2    5
# [3,]    3    6
# Unlike a vector, the class of a matrix is "matrix", no matter what's in it
class(mat) # => "matrix"
# Ask for the first row
mat[1,] # => [1] 1 4
# Perform operation on the first column
3 * mat[,1] # => [1] 3 6 9
# Ask for a specific cell
mat[3,2] # => [1] 6
# Transpose the whole matrix
t(mat)
# =>
#      [,1] [,2] [,3]
# [1,]    1    2    3
# [2,]    4    5    6

# cbind() sticks vectors together column-wise to make a matrix
mat2 <- cbind(1:4, c("dog", "cat", "bird", "dog"))
mat2
# =>
#      [,1] [,2]   
# [1,] "1"  "dog"  
# [2,] "2"  "cat"  
# [3,] "3"  "bird" 
# [4,] "4"  "dog"
class(mat2) # => [1] matrix
# Again, note what happened!
# Because matrices must contain entries all of the same class,
# everything got converted to the character class
c(class(mat2[,1]), class(mat2[,2]))

# rbind() sticks vectors together row-wise to make a matrix
mat3 <- rbind(c(1,2,4,5), c(6,7,0,4))
mat3
# =>
#      [,1] [,2] [,3] [,4]
# [1,]    1    2    4    5
# [2,]    6    7    0    4
# Aah, everything of the same class. No coercions. Much better.

# TWO-DIMENSIONAL (DIFFERENT CLASSES)

# For columns of different classes, use the data frame
dat <- data.frame(c(5,2,1,4), c("dog", "cat", "bird", "dog"))
names(dat) <- c("number", "species") # name the columns
class(dat) # => [1] "data.frame"
dat
# =>
#   number species
# 1      5     dog
# 2      2     cat
# 3      1    bird
# 4      4     dog
class(dat$number) # => [1] "numeric"
class(dat[,2]) # => [1] "factor"
# The data.frame() function converts character vectors to factor vectors

# There are many twisty ways to subset data frames, all subtly unalike
dat$number # => [1] 5 2 1 4
dat[,1] # => [1] 5 2 1 4
dat[,"number"] # => [1] 5 2 1 4

# MULTI-DIMENSIONAL (ALL OF ONE CLASS)

# Arrays creates n-dimensional tables
# You can make a two-dimensional table (sort of like a matrix)
array(c(c(1,2,4,5),c(8,9,3,6)), dim=c(2,4))
# =>
#      [,1] [,2] [,3] [,4]
# [1,]    1    4    8    3
# [2,]    2    5    9    6
# You can use array to make three-dimensional matrices too
array(c(c(c(2,300,4),c(8,9,0)),c(c(5,60,0),c(66,7,847))), dim=c(3,2,2))
# =>
# , , 1
#
#     [,1] [,2]
# [1,]    1    4
# [2,]    2    5
#
# , , 2
#
#      [,1] [,2]
# [1,]    8    1
# [2,]    9    2

# LISTS (MULTI-DIMENSIONAL, POSSIBLY RAGGED, OF DIFFERENT TYPES)

# Finally, R has lists (of vectors)
list1 <- list(time = 1:40, price = c(rnorm(40,.5*list1$time,4))) # random
list1

# You can get items in the list like so
list1$time
# You can subset list items like vectors
list1$price[4]

#########################
# The apply() family of functions
#########################

# Remember mat?
mat
# =>
#      [,1] [,2]
# [1,]    1    4
# [2,]    2    5
# [3,]    3    6
# Use apply(X, MARGIN, FUN) to apply function FUN to a matrix X
# over rows (MAR = 1) or columns (MAR = 2)
# That is, R does FUN to each row (or column) of X, much faster than a
# for or while loop would do
apply(mat, MAR = 2, myFunc)
# =>
#      [,1] [,2]
# [1,]    3   15
# [2,]    7   19
# [3,]   11   23
# Other functions: ?lapply, ?sapply

# Don't feel too intimidated; everyone agrees they are rather confusing

# The plyr package aims to replace (and improve upon!) the *apply() family.

install.packages("plyr")
require(plyr)
?plyr

#########################
# Loading data
#########################

# "pets.csv" is a file on the internet
pets <- read.csv("http://learnxinyminutes.com/docs/pets.csv")
pets
head(pets, 2) # first two rows
tail(pets, 1) # last row

# To save a data frame or matrix as a .csv file
write.csv(pets, "pets2.csv") # to make a new .csv file
# set working directory with setwd(), look it up with getwd()

# Try ?read.csv and ?write.csv for more information

#########################
# Plots
#########################

# Scatterplots!
plot(list1$time, list1$price, main = "fake data")
# Regressions!
linearModel <- lm(price  ~ time, data = list1)
linearModel # outputs result of regression
# Plot regression line on existing plot
abline(linearModel, col = "red")
# Get a variety of nice diagnostics
plot(linearModel)

# Histograms!
hist(rpois(n = 10000, lambda = 5), col = "thistle")

# Barplots!
barplot(c(1,4,5,1,2), names.arg = c("red","blue","purple","green","yellow"))

# Try the ggplot2 package for more and better graphics

install.packages("ggplot2")
require(ggplot2)
?ggplot2

```

## How do I get R?

* Get R and the R GUI from [http://www.r-project.org/](http://www.r-project.org/)
* [RStudio](http://www.rstudio.com/ide/) is another GUI