From 11aab085d656b79482e92a05acbbac81125bfb78 Mon Sep 17 00:00:00 2001 From: Kristin Linn Date: Tue, 20 Oct 2015 16:22:40 -0400 Subject: add statistical analysis section with general linear models --- r.html.markdown | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 101 insertions(+), 4 deletions(-) diff --git a/r.html.markdown b/r.html.markdown index d3d725d3..3d0b9b9e 100644 --- a/r.html.markdown +++ b/r.html.markdown @@ -3,6 +3,7 @@ language: R contributors: - ["e99n09", "http://github.com/e99n09"] - ["isomorphismes", "http://twitter.com/isomorphisms"] + - ["kalinn", "http://github.com/kalinn"] filename: learnr.r --- @@ -196,6 +197,14 @@ class(NaN) # "numeric" # You can do arithmetic on two vectors with length greater than 1, # so long as the larger vector's length is an integer multiple of the smaller c(1,2,3) + c(1,2,3) # 2 4 6 +# Since a single number is a vector of length one, scalars are applied +# elementwise to vectors +(4 * c(1,2,3) - 2) / 2 # 1 3 5 +# Except for scalars, use caution when performing arithmetic on vectors with +# different lengths. Although it can be done, +c(1,2,3,1,2,3) * c(1,2) # 1 4 3 2 2 6 +# Matching lengths is better practice and easier to read +c(1,2,3,1,2,3) * c(1,2,1,2,1,2) # CHARACTERS # There's no difference between strings and characters in R @@ -234,6 +243,9 @@ class(NA) # "logical" TRUE | FALSE # TRUE # AND TRUE & FALSE # FALSE +# Applying | and & to vectors returns elementwise logic operations +c(TRUE,FALSE,FALSE) | c(FALSE,TRUE,FALSE) # TRUE TRUE FALSE +c(TRUE,FALSE,TRUE) & c(FALSE,TRUE,TRUE) # FALSE FALSE TRUE # You can test if x is TRUE isTRUE(TRUE) # TRUE # Here we get a logical vector with many elements: @@ -663,6 +675,95 @@ write.csv(pets, "pets2.csv") # to make a new .csv file +######################### +# Statistical Analysis +######################### + +# Linear regression! +linearModel <- lm(price ~ time, data = list1) +linearModel # outputs result of regression +# => +# Call: +# lm(formula = price ~ time, data = list1) +# +# Coefficients: +# (Intercept) time +# 0.1453 0.4943 +summary(linearModel) # more verbose output from the regression +# => +# Call: +# lm(formula = price ~ time, data = list1) +# +# Residuals: +# Min 1Q Median 3Q Max +# -8.3134 -3.0131 -0.3606 2.8016 10.3992 +# +# Coefficients: +# Estimate Std. Error t value Pr(>|t|) +# (Intercept) 0.14527 1.50084 0.097 0.923 +# time 0.49435 0.06379 7.749 2.44e-09 *** +# --- +# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 +# +# Residual standard error: 4.657 on 38 degrees of freedom +# Multiple R-squared: 0.6124, Adjusted R-squared: 0.6022 +# F-statistic: 60.05 on 1 and 38 DF, p-value: 2.44e-09 +coef(linearModel) # extract estimated parameters +# => +# (Intercept) time +# 0.1452662 0.4943490 +summary(linearModel)$coefficients # another way to extract results +# => +# Estimate Std. Error t value Pr(>|t|) +# (Intercept) 0.1452662 1.50084246 0.09678975 9.234021e-01 +# time 0.4943490 0.06379348 7.74920901 2.440008e-09 +summary(linearModel)$coefficients[,4] # the p-values +# => +# (Intercept) time +# 9.234021e-01 2.440008e-09 + +# GENERAL LINEAR MODELS +# Logistic regression +set.seed(1) +list1$success = rbinom(length(list1$time), 1, .5) # random binary +glModel <- glm(success ~ time, data = list1, + family=binomial(link="logit")) +glModel # outputs result of logistic regression +# => +# Call: glm(formula = success ~ time, +# family = binomial(link = "logit"), data = list1) +# +# Coefficients: +# (Intercept) time +# 0.17018 -0.01321 +# +# Degrees of Freedom: 39 Total (i.e. Null); 38 Residual +# Null Deviance: 55.35 +# Residual Deviance: 55.12 AIC: 59.12 +summary(glModel) # more verbose output from the regression +# => +# Call: +# glm(formula = success ~ time, +# family = binomial(link = "logit"), data = list1) + +# Deviance Residuals: +# Min 1Q Median 3Q Max +# -1.245 -1.118 -1.035 1.202 1.327 +# +# Coefficients: +# Estimate Std. Error z value Pr(>|z|) +# (Intercept) 0.17018 0.64621 0.263 0.792 +# time -0.01321 0.02757 -0.479 0.632 +# +# (Dispersion parameter for binomial family taken to be 1) +# +# Null deviance: 55.352 on 39 degrees of freedom +# Residual deviance: 55.121 on 38 degrees of freedom +# AIC: 59.121 +# +# Number of Fisher Scoring iterations: 3 + + ######################### # Plots ######################### @@ -670,9 +771,6 @@ write.csv(pets, "pets2.csv") # to make a new .csv file # BUILT-IN PLOTTING FUNCTIONS # Scatterplots! plot(list1$time, list1$price, main = "fake data") -# Regressions! -linearModel <- lm(price ~ time, data = list1) -linearModel # outputs result of regression # Plot regression line on existing plot abline(linearModel, col = "red") # Get a variety of nice diagnostics @@ -696,7 +794,6 @@ pp + geom_point() # ggplot2 has excellent documentation (available http://docs.ggplot2.org/current/) - ``` ## How do I get R? -- cgit v1.2.3 From 622e03a141f586e858209fe98c649aa2a4bb9183 Mon Sep 17 00:00:00 2001 From: Kristin Linn Date: Tue, 20 Oct 2015 16:57:36 -0400 Subject: add statistical analysis section with general linear models --- r.html.markdown | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/r.html.markdown b/r.html.markdown index 61fc7a01..3d0b9b9e 100644 --- a/r.html.markdown +++ b/r.html.markdown @@ -16,8 +16,7 @@ R is a statistical computing language. It has lots of libraries for uploading an # You can't make multi-line comments, # but you can stack multiple comments like so. -# in Windows you can use CTRL-ENTER to execute a line. -# on Mac it is COMMAND-ENTER +# in Windows or Mac, hit COMMAND-ENTER to execute a line @@ -38,8 +37,8 @@ head(rivers) # peek at the data set length(rivers) # how many rivers were measured? # 141 summary(rivers) # what are some summary statistics? -# Min. 1st Qu. Median Mean 3rd Qu. Max. -# 135.0 310.0 425.0 591.2 680.0 3710.0 +# Min. 1st Qu. Median Mean 3rd Qu. Max. +# 135.0 310.0 425.0 591.2 680.0 3710.0 # make a stem-and-leaf plot (a histogram-like data visualization) stem(rivers) @@ -56,14 +55,14 @@ stem(rivers) # 14 | 56 # 16 | 7 # 18 | 9 -# 20 | +# 20 | # 22 | 25 # 24 | 3 -# 26 | -# 28 | -# 30 | -# 32 | -# 34 | +# 26 | +# 28 | +# 30 | +# 32 | +# 34 | # 36 | 1 stem(log(rivers)) # Notice that the data are neither normal nor log-normal! @@ -72,7 +71,7 @@ stem(log(rivers)) # Notice that the data are neither normal nor log-normal! # The decimal point is 1 digit(s) to the left of the | # # 48 | 1 -# 50 | +# 50 | # 52 | 15578 # 54 | 44571222466689 # 56 | 023334677000124455789 @@ -87,7 +86,7 @@ stem(log(rivers)) # Notice that the data are neither normal nor log-normal! # 74 | 84 # 76 | 56 # 78 | 4 -# 80 | +# 80 | # 82 | 2 # make a histogram: @@ -110,7 +109,7 @@ sort(discoveries) # [76] 4 4 4 4 5 5 5 5 5 5 5 6 6 6 6 6 6 7 7 7 7 8 9 10 12 stem(discoveries, scale=2) -# +# # The decimal point is at the | # # 0 | 000000000 @@ -124,14 +123,14 @@ stem(discoveries, scale=2) # 8 | 0 # 9 | 0 # 10 | 0 -# 11 | +# 11 | # 12 | 0 max(discoveries) # 12 summary(discoveries) -# Min. 1st Qu. Median Mean 3rd Qu. Max. -# 0.0 2.0 3.0 3.1 4.0 12.0 +# Min. 1st Qu. Median Mean 3rd Qu. Max. +# 0.0 2.0 3.0 3.1 4.0 12.0 # Roll a die a few times round(runif(7, min=.5, max=6.5)) @@ -275,7 +274,7 @@ class(NULL) # NULL parakeet = c("beak", "feathers", "wings", "eyes") parakeet # => -# [1] "beak" "feathers" "wings" "eyes" +# [1] "beak" "feathers" "wings" "eyes" parakeet <- NULL parakeet # => @@ -292,7 +291,7 @@ as.numeric("Bilbo") # => # [1] NA # Warning message: -# NAs introduced by coercion +# NAs introduced by coercion # Also note: those were just the basic data types # There are many more data types, such as for dates, time series, etc. @@ -432,10 +431,10 @@ mat %*% t(mat) mat2 <- cbind(1:4, c("dog", "cat", "bird", "dog")) mat2 # => -# [,1] [,2] -# [1,] "1" "dog" -# [2,] "2" "cat" -# [3,] "3" "bird" +# [,1] [,2] +# [1,] "1" "dog" +# [2,] "2" "cat" +# [3,] "3" "bird" # [4,] "4" "dog" class(mat2) # matrix # Again, note what happened! -- cgit v1.2.3 From 81c1b8334cdccd054d4131fc0309eeebebef53f9 Mon Sep 17 00:00:00 2001 From: Kristin Linn Date: Tue, 20 Oct 2015 17:06:41 -0400 Subject: fix spaces at end-of-lines --- r.html.markdown | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/r.html.markdown b/r.html.markdown index 3d0b9b9e..ce313ecc 100644 --- a/r.html.markdown +++ b/r.html.markdown @@ -16,9 +16,8 @@ R is a statistical computing language. It has lots of libraries for uploading an # You can't make multi-line comments, # but you can stack multiple comments like so. -# in Windows or Mac, hit COMMAND-ENTER to execute a line - - +# in Windows you can use CTRL-ENTER to execute a line. +# on Mac it is COMMAND-ENTER ############################################################################# # Stuff you can do without understanding anything about programming @@ -37,8 +36,8 @@ head(rivers) # peek at the data set length(rivers) # how many rivers were measured? # 141 summary(rivers) # what are some summary statistics? -# Min. 1st Qu. Median Mean 3rd Qu. Max. -# 135.0 310.0 425.0 591.2 680.0 3710.0 +# Min. 1st Qu. Median Mean 3rd Qu. Max. +# 135.0 310.0 425.0 591.2 680.0 3710.0 # make a stem-and-leaf plot (a histogram-like data visualization) stem(rivers) @@ -55,14 +54,14 @@ stem(rivers) # 14 | 56 # 16 | 7 # 18 | 9 -# 20 | +# 20 | # 22 | 25 # 24 | 3 -# 26 | -# 28 | -# 30 | -# 32 | -# 34 | +# 26 | +# 28 | +# 30 | +# 32 | +# 34 | # 36 | 1 stem(log(rivers)) # Notice that the data are neither normal nor log-normal! @@ -71,7 +70,7 @@ stem(log(rivers)) # Notice that the data are neither normal nor log-normal! # The decimal point is 1 digit(s) to the left of the | # # 48 | 1 -# 50 | +# 50 | # 52 | 15578 # 54 | 44571222466689 # 56 | 023334677000124455789 @@ -86,7 +85,7 @@ stem(log(rivers)) # Notice that the data are neither normal nor log-normal! # 74 | 84 # 76 | 56 # 78 | 4 -# 80 | +# 80 | # 82 | 2 # make a histogram: @@ -109,7 +108,7 @@ sort(discoveries) # [76] 4 4 4 4 5 5 5 5 5 5 5 6 6 6 6 6 6 7 7 7 7 8 9 10 12 stem(discoveries, scale=2) -# +# # The decimal point is at the | # # 0 | 000000000 @@ -130,7 +129,7 @@ max(discoveries) # 12 summary(discoveries) # Min. 1st Qu. Median Mean 3rd Qu. Max. -# 0.0 2.0 3.0 3.1 4.0 12.0 +# 0.0 2.0 3.0 3.1 4.0 12.0 # Roll a die a few times round(runif(7, min=.5, max=6.5)) @@ -274,7 +273,7 @@ class(NULL) # NULL parakeet = c("beak", "feathers", "wings", "eyes") parakeet # => -# [1] "beak" "feathers" "wings" "eyes" +# [1] "beak" "feathers" "wings" "eyes" parakeet <- NULL parakeet # => @@ -291,7 +290,7 @@ as.numeric("Bilbo") # => # [1] NA # Warning message: -# NAs introduced by coercion +# NAs introduced by coercion # Also note: those were just the basic data types # There are many more data types, such as for dates, time series, etc. @@ -431,10 +430,10 @@ mat %*% t(mat) mat2 <- cbind(1:4, c("dog", "cat", "bird", "dog")) mat2 # => -# [,1] [,2] -# [1,] "1" "dog" -# [2,] "2" "cat" -# [3,] "3" "bird" +# [,1] [,2] +# [1,] "1" "dog" +# [2,] "2" "cat" +# [3,] "3" "bird" # [4,] "4" "dog" class(mat2) # matrix # Again, note what happened! -- cgit v1.2.3 From 7ad97c290436d9f01ba9b5dd2a557869995efa0c Mon Sep 17 00:00:00 2001 From: Kristin Linn Date: Tue, 20 Oct 2015 17:10:58 -0400 Subject: fix spaces at end-of-lines again --- r.html.markdown | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/r.html.markdown b/r.html.markdown index ce313ecc..8539b10e 100644 --- a/r.html.markdown +++ b/r.html.markdown @@ -19,6 +19,8 @@ R is a statistical computing language. It has lots of libraries for uploading an # in Windows you can use CTRL-ENTER to execute a line. # on Mac it is COMMAND-ENTER + + ############################################################################# # Stuff you can do without understanding anything about programming ############################################################################# @@ -122,13 +124,13 @@ stem(discoveries, scale=2) # 8 | 0 # 9 | 0 # 10 | 0 -# 11 | +# 11 | # 12 | 0 max(discoveries) # 12 summary(discoveries) -# Min. 1st Qu. Median Mean 3rd Qu. Max. +# Min. 1st Qu. Median Mean 3rd Qu. Max. # 0.0 2.0 3.0 3.1 4.0 12.0 # Roll a die a few times @@ -793,6 +795,7 @@ pp + geom_point() # ggplot2 has excellent documentation (available http://docs.ggplot2.org/current/) + ``` ## How do I get R? -- cgit v1.2.3