diff options
| author | i <isomorphisms@sdf.org> | 2013-08-08 17:50:52 -0400 | 
|---|---|---|
| committer | i <isomorphisms@sdf.org> | 2013-08-08 17:50:52 -0400 | 
| commit | ee1b3546ad1a1a0601f2dc413d0b96f345c27ad9 (patch) | |
| tree | 25e5f5d7e849d4d97326b3d6399555adcb49a26b | |
| parent | 29d2880c6177ff243e6f2413b5f17e9c7fe73f3f (diff) | |
Update r.html.markdown
significant changes. style changes (no !, no =>). content additions. start by showing off R's non-programming features before getting to the language per se.
| -rw-r--r-- | r.html.markdown | 311 | 
1 files changed, 246 insertions, 65 deletions
| diff --git a/r.html.markdown b/r.html.markdown index 0240e8fb..61140be5 100644 --- a/r.html.markdown +++ b/r.html.markdown @@ -16,61 +16,242 @@ R is a statistical computing language. It has lots of good built-in functions fo  # Hit COMMAND-ENTER to execute a line + +################################################################### +# Stuff you can do without understanding anything about programming +################################################################### + +data()	# Browse pre-loaded data sets +data(rivers)	# Lengths of Major North American Rivers +ls()	# Notice that "rivers" appears in the workspace +head(rivers)	# peek at the dataset +# 735 320 325 392 524 450 +length(rivers)	# how many rivers were measured? +# 141 +summary(rivers) +#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.  +#  135.0   310.0   425.0   591.2   680.0  3710.0  +stem(rivers)	#stem-and-leaf plot (like a histogram) +# +#  The decimal point is 2 digit(s) to the right of the | +# +#   0 | 4 +#   2 | 011223334555566667778888899900001111223333344455555666688888999 +#   4 | 111222333445566779001233344567 +#   6 | 000112233578012234468 +#   8 | 045790018 +#  10 | 04507 +#  12 | 1471 +#  14 | 56 +#  16 | 7 +#  18 | 9 +#  20 |  +#  22 | 25 +#  24 | 3 +#  26 |  +#  28 |  +#  30 |  +#  32 |  +#  34 |  +#  36 | 1 + + +stem(log(rivers))	#Notice that the data are neither normal nor log-normal! Take that, Bell Curve fundamentalists. 
+ +#  The decimal point is 1 digit(s) to the left of the | +# +#  48 | 1 +#  50 |  +#  52 | 15578 +#  54 | 44571222466689 +#  56 | 023334677000124455789 +#  58 | 00122366666999933445777 +#  60 | 122445567800133459 +#  62 | 112666799035 +#  64 | 00011334581257889 +#  66 | 003683579 +#  68 | 0019156 +#  70 | 079357 +#  72 | 89 +#  74 | 84 +#  76 | 56 +#  78 | 4 +#  80 |  +#  82 | 2 + + +hist(rivers, col="#333333", border="white", breaks=25)	#play around with these parameters +hist(log(rivers), col="#333333", border="white", breaks=25)	#you'll do more plotting later + +#Here's another neat data set that comes pre-loaded. R has tons of these. data() +data(discoveries) +plot(discoveries, col="#333333", lwd=3, xlab="Year", main="Number of important discoveries per year") +plot(discoveries, col="#333333", lwd=3, type = "h", xlab="Year", main="Number of important discoveries per year") + + +#rather than leaving the default ordering (by year) we could also sort to see what's typical +sort(discoveries) +#  [1]  0  0  0  0  0  0  0  0  0  1  1  1  1  1  1  1  1  1  1  1  1  2  2  2  2 +# [26]  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  3  3  3 +# [51]  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  4  4  4  4  4  4  4  4 +# [76]  4  4  4  4  5  5  5  5  5  5  5  6  6  6  6  6  6  7  7  7  7  8  9 10 12 + +stem(discoveries, scale=2) +#  +#  The decimal point is at the | +# +#   0 | 000000000 +#   1 | 000000000000 +#   2 | 00000000000000000000000000 +#   3 | 00000000000000000000 +#   4 | 000000000000 +#   5 | 0000000 +#   6 | 000000 +#   7 | 0000 +#   8 | 0 +#   9 | 0 +#  10 | 0 +#  11 |  +#  12 | 0 + +max(discoveries) +# 12 + +summary(discoveries) +#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.  
+#    0.0     2.0     3.0     3.1     4.0    12.0  + + + + +#Basic statistical operations don't require any programming knowledge either + +#roll a die a few times +round(runif(7, min=.5, max=6.5)) +# 1 4 6 1 4 6 4 + +#your numbers will differ from mine unless we both call set.seed(31337) first + + +#draw from a standard Gaussian 9 times +rnorm(9) +# [1]  0.07528471  1.03499859  1.34809556 -0.82356087  0.61638975 -1.88757271 +# [7] -0.59975593  0.57629164  1.08455362 + + + + + + + + +  ######################### -# The absolute basics +# Basic programming stuff  #########################  # NUMBERS -# We've got doubles! Behold the "numeric" class -5 # => [1] 5 -class(5) # => [1] "numeric" -# We've also got integers! They look suspiciously similar, -# but indeed are different -5L # => [1] 5 -class(5L) # => [1] "integer" +# "numeric" means double-precision floating-point numbers +5	# 5 +class(5)	# "numeric" +5e4	# 50000				#handy when dealing with large, small, or variable orders of magnitude +6.02e23	# Avogadro's number +1.6e-35	# Planck length + +# long-storage integers are written with L +5L	# 5 +class(5L)	# "integer" +  # Try ?class for more information on the class() function -# In fact, you can look up the documentation on just about anything with ? +# In fact, you can look up the documentation on `xyz` with ?xyz +# or see the source for `xyz` by evaluating xyz + +# Arithmetic +10 + 66	# 76 +53.2 - 4	# 49.2 +2 * 2.0	# 4 +3L / 4	# 0.75 +3 %% 2	# 1 + +# Weird number types +class(NaN)	# "numeric" +class(Inf)	# "numeric" +class(-Inf)	# "numeric"		#used in for example integrate(dnorm, 3, Inf) -- which obviates Z-score tables + +# but beware, NaN isn't the only weird type... 
+class(NA)	# see below +class(NULL)	# NULL + + +# SIMPLE LISTS +c(6, 8, 7, 5, 3, 0, 9)	# 6 8 7 5 3 0 9 +c('alef', 'bet', 'gimmel', 'dalet', 'he')	# "alef"   "bet"    "gimmel" "dalet"  "he" +c('Z', 'o', 'r', 'o') == "Zoro"	# FALSE FALSE FALSE FALSE + +#some more nice built-ins +5:15	# 5  6  7  8  9 10 11 12 13 14 15 + +seq(from=0, to=31337, by=1337) +#  [1]     0  1337  2674  4011  5348  6685  8022  9359 10696 12033 13370 14707 +# [13] 16044 17381 18718 20055 21392 22729 24066 25403 26740 28077 29414 30751 + +letters +#  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s" +# [20] "t" "u" "v" "w" "x" "y" "z" + +month.abb	# "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec" + + +# Access the n'th element of a list with list.name[n] or sometimes list.name[[n]] +letters[18]	# "r" +LETTERS[13]	# "M" +month.name[9]	# "September" +c(6, 8, 7, 5, 3, 0, 9)[3]	# 7 -# All the normal operations! -10 + 66 # => [1] 76 -53.2 - 4 # => [1] 49.2 -2 * 2.0 # => [1] 4 -3L / 4 # => [1] 0.75 -3 %% 2 # => [1] 1 -# Finally, we've got not-a-numbers! They're numerics too -class(NaN) # => [1] "numeric"  # CHARACTERS -# We've (sort of) got strings! Behold the "character" class -"plugh" # => [1] "plugh" -class("plugh") # "character"  # There's no difference between strings and characters in R +"Horatio"	# "Horatio" +class("Horatio") # "character" +substr("Fortuna multis dat nimis, nulli satis.", 9, 15)	# "multis " +gsub('u', 'ø', "Fortuna multis dat nimis, nulli satis.")	# "Fortøna møltis dat nimis, nølli satis." + + +  # LOGICALS -# We've got booleans! 
Behold the "logical" class -class(TRUE) # => [1] "logical" -class(FALSE) # => [1] "logical" +# booleans +class(TRUE)	# "logical" +class(FALSE)	# "logical"  # Behavior is normal -TRUE == TRUE # => [1] TRUE -TRUE == FALSE # => [1] FALSE -FALSE != FALSE # => [1] FALSE -FALSE != TRUE # => [1] TRUE +TRUE == TRUE	# TRUE +TRUE == FALSE	# FALSE +FALSE != FALSE	# FALSE +FALSE != TRUE	# TRUE  # Missing data (NA) is logical, too -class(NA) # => [1] "logical" +class(NA)	# "logical" + +  # FACTORS  # The factor class is for categorical data -# It has an attribute called levels that describes all the possible categories -factor("dog") -# => -# [1] dog -# Levels: dog -# (This will make more sense once we start talking about vectors) +# which can be ordered (like children's grade levels) +# or unordered (like gender) +levels(factor(c("female", "male", "male", "female", "NA", "female")))	# "female" "male"   "NA"  + +factor(c("female", "female", "male", "NA", "female")) +#  female female male   NA     female +# Levels: female male NA + +data(infert)	#Infertility after Spontaneous and Induced Abortion +levels(infert$education)	# "0-5yrs"  "6-11yrs" "12+ yrs" + +  # VARIABLES @@ -80,8 +261,8 @@ y <- "1" # this is preferred  TRUE -> z # this works but is weird  # We can coerce variables to different classes -as.numeric(y) # => [1] 1 -as.character(x) # => [1] "5" +as.numeric(y)	# 1 +as.character(x)	# "5"  # LOOPS @@ -122,7 +303,7 @@ myFunc <- function(x) {  }  # Called like any other R function: -myFunc(5) # => [1] 19 +myFunc(5)	# 19  #########################  # Fun with data: vectors, matrices, data frames, and arrays @@ -132,35 +313,35 @@ myFunc(5) # => [1] 19  # You can vectorize anything, so long as all components have the same type  vec <- c(8, 9, 10, 11) -vec # => [1]  8  9 10 11 +vec	#  8  9 10 11  # The class of a vector is the class of its components -class(vec) # => [1] "numeric" +class(vec)	# "numeric"  # If you vectorize items of different classes, weird coercions 
happen -c(TRUE, 4) # => [1] 1 4 -c("dog", TRUE, 4) # => [1] "dog"  "TRUE" "4" +c(TRUE, 4)	# 1 4 +c("dog", TRUE, 4)	# "dog"  "TRUE" "4"  # We ask for specific components like so (R starts counting from 1) -vec[1] # => [1] 8 +vec[1]	# 8  # We can also search for the indices of specific components, -which(vec %% 2 == 0) # => [1] 1 3 +which(vec %% 2 == 0)	# 1 3  # or grab just the first or last entry in the vector -head(vec, 1) # => [1] 8 -tail(vec, 1) # => [1] 11 +head(vec, 1)	# 8 +tail(vec, 1)	# 11  # If an index "goes over" you'll get NA: -vec[6] # => [1] NA +vec[6]	# NA  # You can find the length of your vector with length() -length(vec) # => [1] 4 +length(vec)	# 4  # You can perform operations on entire vectors or subsets of vectors -vec * 4 # => [1] 16 20 24 28 -vec[2:3] * 5 # => [1] 25 30 +vec * 4	# 16 20 24 28 +vec[2:3] * 5	# 25 30  # and there are many built-in functions to summarize vectors -mean(vec) # => [1] 9.5 -var(vec) # => [1] 1.666667 -sd(vec) # => [1] 1.290994 -max(vec) # => [1] 11 -min(vec) # => [1] 8 -sum(vec) # => [1] 38 +mean(vec)	# 9.5 +var(vec)	# 1.666667 +sd(vec)	# 1.290994 +max(vec)	# 11 +min(vec)	# 8 +sum(vec)	# 38  # TWO-DIMENSIONAL (ALL ONE CLASS) @@ -175,11 +356,11 @@ mat  # Unlike a vector, the class of a matrix is "matrix", no matter what's in it  class(mat) # => "matrix"  # Ask for the first row -mat[1,] # => [1] 1 4 +mat[1,]	# 1 4  # Perform operation on the first column -3 * mat[,1] # => [1] 3 6 9 +3 * mat[,1]	# 3 6 9  # Ask for a specific cell -mat[3,2] # => [1] 6 +mat[3,2]	# 6  # Transpose the whole matrix  t(mat)  # => @@ -196,7 +377,7 @@ mat2  # [2,] "2"  "cat"    # [3,] "3"  "bird"   # [4,] "4"  "dog" -class(mat2) # => [1] matrix +class(mat2)	# matrix  # Again, note what happened!  
# Because matrices must contain entries all of the same class,  # everything got converted to the character class @@ -216,7 +397,7 @@ mat3  # For columns of different classes, use the data frame  dat <- data.frame(c(5,2,1,4), c("dog", "cat", "bird", "dog"))  names(dat) <- c("number", "species") # name the columns -class(dat) # => [1] "data.frame" +class(dat)	# "data.frame"  dat  # =>  #   number species @@ -224,14 +405,14 @@ dat  # 2      2     cat  # 3      1    bird  # 4      4     dog -class(dat$number) # => [1] "numeric" -class(dat[,2]) # => [1] "factor" +class(dat$number)	# "numeric" +class(dat[,2])	# "factor"  # The data.frame() function converts character vectors to factor vectors  # There are many twisty ways to subset data frames, all subtly unalike -dat$number # => [1] 5 2 1 4 -dat[,1] # => [1] 5 2 1 4 -dat[,"number"] # => [1] 5 2 1 4 +dat$number	# 5 2 1 4 +dat[,1]	# 5 2 1 4 +dat[,"number"]	# 5 2 1 4  # MULTI-DIMENSIONAL (ALL OF ONE CLASS) | 
