From 11aab085d656b79482e92a05acbbac81125bfb78 Mon Sep 17 00:00:00 2001
From: Kristin Linn <klinn@upenn.edu>
Date: Tue, 20 Oct 2015 16:22:40 -0400
Subject: add statistical analysis section with general linear models

---
 r.html.markdown | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 101 insertions(+), 4 deletions(-)

(limited to 'r.html.markdown')

diff --git a/r.html.markdown b/r.html.markdown
index d3d725d3..3d0b9b9e 100644
--- a/r.html.markdown
+++ b/r.html.markdown
@@ -3,6 +3,7 @@ language: R
 contributors:
     - ["e99n09", "http://github.com/e99n09"]
     - ["isomorphismes", "http://twitter.com/isomorphisms"]
+    - ["kalinn", "http://github.com/kalinn"]
 filename: learnr.r
 ---
 
@@ -196,6 +197,14 @@ class(NaN) # "numeric"
 # You can do arithmetic on two vectors with length greater than 1,
 # so long as the larger vector's length is an integer multiple of the smaller
 c(1,2,3) + c(1,2,3) # 2 4 6
+# Since a single number is a vector of length one, scalars are applied 
+# elementwise to vectors
+(4 * c(1,2,3) - 2) / 2 # 1 3 5
+# Except for scalars, use caution when performing arithmetic on vectors with
+# different lengths. R silently recycles the shorter vector, so this works:
+c(1,2,3,1,2,3) * c(1,2) # 1 4 3 2 2 6
+# Matching lengths is better practice and easier to read
+c(1,2,3,1,2,3) * c(1,2,1,2,1,2) 
 
 # CHARACTERS
 # There's no difference between strings and characters in R
@@ -234,6 +243,9 @@ class(NA)	# "logical"
 TRUE | FALSE	# TRUE
 # AND
 TRUE & FALSE	# FALSE
+# Applying | and & to vectors performs the logical operation elementwise
+c(TRUE,FALSE,FALSE) | c(FALSE,TRUE,FALSE) # TRUE TRUE FALSE
+c(TRUE,FALSE,TRUE) & c(FALSE,TRUE,TRUE) # FALSE FALSE TRUE
 # You can test if x is TRUE
 isTRUE(TRUE)	# TRUE
 # Here we get a logical vector with many elements:
@@ -663,6 +675,95 @@ write.csv(pets, "pets2.csv") # to make a new .csv file
 
 
+#########################
+# Statistical Analysis
+#########################
+
+# Linear regression!
+linearModel <- lm(price  ~ time, data = list1)
+linearModel # outputs result of regression
+# =>
+# Call:
+# lm(formula = price ~ time, data = list1)
+# 
+# Coefficients:
+# (Intercept)         time  
+#      0.1453       0.4943  
+summary(linearModel) # more verbose output from the regression
+# =>
+# Call:
+# lm(formula = price ~ time, data = list1)
+#
+# Residuals:
+#     Min      1Q  Median      3Q     Max 
+# -8.3134 -3.0131 -0.3606  2.8016 10.3992 
+#
+# Coefficients:
+#             Estimate Std. Error t value Pr(>|t|)    
+# (Intercept)  0.14527    1.50084   0.097    0.923    
+# time         0.49435    0.06379   7.749 2.44e-09 ***
+# ---
+# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+#
+# Residual standard error: 4.657 on 38 degrees of freedom
+# Multiple R-squared:  0.6124,	Adjusted R-squared:  0.6022 
+# F-statistic: 60.05 on 1 and 38 DF,  p-value: 2.44e-09
+coef(linearModel) # extract estimated parameters
+# =>
+# (Intercept)        time 
+#   0.1452662   0.4943490 
+summary(linearModel)$coefficients # another way to extract results
+# =>
+#              Estimate Std. Error    t value     Pr(>|t|)
+# (Intercept) 0.1452662 1.50084246 0.09678975 9.234021e-01
+# time        0.4943490 0.06379348 7.74920901 2.440008e-09
+summary(linearModel)$coefficients[,4] # the p-values 
+# =>
+#  (Intercept)         time 
+# 9.234021e-01 2.440008e-09 
+
+# GENERALIZED LINEAR MODELS
+# Logistic regression
+set.seed(1)
+list1$success = rbinom(length(list1$time), 1, .5) # random binary
+glModel <- glm(success  ~ time, data = list1, 
+	family=binomial(link="logit"))
+glModel # outputs result of logistic regression
+# =>
+# Call:  glm(formula = success ~ time, 
+#	family = binomial(link = "logit"), data = list1)
+#
+# Coefficients:
+# (Intercept)         time  
+#     0.17018     -0.01321  
+# 
+# Degrees of Freedom: 39 Total (i.e. Null);  38 Residual
+# Null Deviance:	    55.35 
+# Residual Deviance: 55.12 	 AIC: 59.12
+summary(glModel) # more verbose output from the regression
+# =>
+# Call:
+# glm(formula = success ~ time, 
+#	family = binomial(link = "logit"), data = list1)
+#
+# Deviance Residuals: 
+#    Min      1Q  Median      3Q     Max  
+# -1.245  -1.118  -1.035   1.202   1.327  
+# 
+# Coefficients:
+#             Estimate Std. Error z value Pr(>|z|)
+# (Intercept)  0.17018    0.64621   0.263    0.792
+# time        -0.01321    0.02757  -0.479    0.632
+# 
+# (Dispersion parameter for binomial family taken to be 1)
+#
+#     Null deviance: 55.352  on 39  degrees of freedom
+# Residual deviance: 55.121  on 38  degrees of freedom
+# AIC: 59.121
+# 
+# Number of Fisher Scoring iterations: 3
+
+
 #########################
 # Plots
 #########################
@@ -670,9 +771,6 @@ write.csv(pets, "pets2.csv") # to make a new .csv file
 # BUILT-IN PLOTTING FUNCTIONS
 # Scatterplots!
 plot(list1$time, list1$price, main = "fake data")
-# Regressions!
-linearModel <- lm(price  ~ time, data = list1)
-linearModel # outputs result of regression
 # Plot regression line on existing plot
 abline(linearModel, col = "red")
 # Get a variety of nice diagnostics
@@ -696,7 +794,6 @@ pp + geom_point()
 # ggplot2 has excellent documentation (available http://docs.ggplot2.org/current/)
 
 
-
 ```
 
 ## How do I get R?
-- 
cgit v1.2.3


From 622e03a141f586e858209fe98c649aa2a4bb9183 Mon Sep 17 00:00:00 2001
From: Kristin Linn <klinn@upenn.edu>
Date: Tue, 20 Oct 2015 16:57:36 -0400
Subject: add statistical analysis section with general linear models

---
 r.html.markdown | 43 +++++++++++++++++++++----------------------
 1 file changed, 21 insertions(+), 22 deletions(-)

(limited to 'r.html.markdown')

diff --git a/r.html.markdown b/r.html.markdown
index 61fc7a01..3d0b9b9e 100644
--- a/r.html.markdown
+++ b/r.html.markdown
@@ -16,8 +16,7 @@ R is a statistical computing language. It has lots of libraries for uploading an
 # You can't make multi-line comments,
 # but you can stack multiple comments like so.
 
-# in Windows you can use CTRL-ENTER to execute a line.
-# on Mac it is COMMAND-ENTER
+# in Windows or Mac, hit COMMAND-ENTER to execute a line
 
 
@@ -38,8 +37,8 @@ head(rivers)	# peek at the data set
 length(rivers)	# how many rivers were measured?
 # 141
 summary(rivers) # what are some summary statistics?
-#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
-#  135.0   310.0   425.0   591.2   680.0  3710.0
+#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+#  135.0   310.0   425.0   591.2   680.0  3710.0 
 
 # make a stem-and-leaf plot (a histogram-like data visualization)
 stem(rivers)
@@ -56,14 +55,14 @@ stem(rivers)
 #  14 | 56
 #  16 | 7
 #  18 | 9
-#  20 |
+#  20 | 
 #  22 | 25
 #  24 | 3
-#  26 |
-#  28 |
-#  30 |
-#  32 |
-#  34 |
+#  26 | 
+#  28 | 
+#  30 | 
+#  32 | 
+#  34 | 
 #  36 | 1
 
 stem(log(rivers)) # Notice that the data are neither normal nor log-normal!
@@ -72,7 +71,7 @@ stem(log(rivers)) # Notice that the data are neither normal nor log-normal!
 #  The decimal point is 1 digit(s) to the left of the |
 #
 #  48 | 1
-#  50 |
+#  50 | 
 #  52 | 15578
 #  54 | 44571222466689
 #  56 | 023334677000124455789
@@ -87,7 +86,7 @@ stem(log(rivers)) # Notice that the data are neither normal nor log-normal!
 #  74 | 84
 #  76 | 56
 #  78 | 4
-#  80 |
+#  80 | 
 #  82 | 2
 
 # make a histogram:
@@ -110,7 +109,7 @@ sort(discoveries)
 # [76]  4  4  4  4  5  5  5  5  5  5  5  6  6  6  6  6  6  7  7  7  7  8  9 10 12
 
 stem(discoveries, scale=2)
-#
+# 
 #  The decimal point is at the |
 #
 #   0 | 000000000
@@ -124,14 +123,14 @@ stem(discoveries, scale=2)
 #   8 | 0
 #   9 | 0
 #  10 | 0
-#  11 |
+#  11 | 
 #  12 | 0
 
 max(discoveries)
 # 12
 summary(discoveries)
-#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
-#    0.0     2.0     3.0     3.1     4.0    12.0
+#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+#    0.0     2.0     3.0     3.1     4.0    12.0 
 
 # Roll a die a few times
 round(runif(7, min=.5, max=6.5))
@@ -275,7 +274,7 @@ class(NULL)	# NULL
 parakeet = c("beak", "feathers", "wings", "eyes")
 parakeet
 # =>
-# [1] "beak"     "feathers" "wings"    "eyes"
+# [1] "beak"     "feathers" "wings"    "eyes"    
 parakeet <- NULL
 parakeet
 # =>
@@ -292,7 +291,7 @@ as.numeric("Bilbo")
 # =>
 # [1] NA
 # Warning message:
-# NAs introduced by coercion
+# NAs introduced by coercion 
 
 # Also note: those were just the basic data types
 # There are many more data types, such as for dates, time series, etc.
@@ -432,10 +431,10 @@ mat %*% t(mat)
 mat2 <- cbind(1:4, c("dog", "cat", "bird", "dog"))
 mat2
 # =>
-#      [,1] [,2]
-# [1,] "1"  "dog"
-# [2,] "2"  "cat"
-# [3,] "3"  "bird"
+#      [,1] [,2]   
+# [1,] "1"  "dog"  
+# [2,] "2"  "cat"  
+# [3,] "3"  "bird" 
 # [4,] "4"  "dog"
 class(mat2)	# matrix
 # Again, note what happened!
-- 
cgit v1.2.3


From 81c1b8334cdccd054d4131fc0309eeebebef53f9 Mon Sep 17 00:00:00 2001
From: Kristin Linn <klinn@upenn.edu>
Date: Tue, 20 Oct 2015 17:06:41 -0400
Subject: fix spaces at end-of-lines

---
 r.html.markdown | 41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)

(limited to 'r.html.markdown')

diff --git a/r.html.markdown b/r.html.markdown
index 3d0b9b9e..ce313ecc 100644
--- a/r.html.markdown
+++ b/r.html.markdown
@@ -16,9 +16,8 @@ R is a statistical computing language. It has lots of libraries for uploading an
 # You can't make multi-line comments,
 # but you can stack multiple comments like so.
 
-# in Windows or Mac, hit COMMAND-ENTER to execute a line
-
-
+# in Windows you can use CTRL-ENTER to execute a line.
+# on Mac it is COMMAND-ENTER
 
 #############################################################################
 # Stuff you can do without understanding anything about programming
@@ -37,8 +36,8 @@ head(rivers)	# peek at the data set
 length(rivers)	# how many rivers were measured?
 # 141
 summary(rivers) # what are some summary statistics?
-#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-#  135.0   310.0   425.0   591.2   680.0  3710.0 
+#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
+#  135.0   310.0   425.0   591.2   680.0  3710.0
 
 # make a stem-and-leaf plot (a histogram-like data visualization)
 stem(rivers)
@@ -55,14 +54,14 @@ stem(rivers)
 #  14 | 56
 #  16 | 7
 #  18 | 9
-#  20 | 
+#  20 |
 #  22 | 25
 #  24 | 3
-#  26 | 
-#  28 | 
-#  30 | 
-#  32 | 
-#  34 | 
+#  26 |
+#  28 |
+#  30 |
+#  32 |
+#  34 |
 #  36 | 1
 
 stem(log(rivers)) # Notice that the data are neither normal nor log-normal!
@@ -71,7 +70,7 @@ stem(log(rivers)) # Notice that the data are neither normal nor log-normal!
 #  The decimal point is 1 digit(s) to the left of the |
 #
 #  48 | 1
-#  50 | 
+#  50 |
 #  52 | 15578
 #  54 | 44571222466689
 #  56 | 023334677000124455789
@@ -86,7 +85,7 @@ stem(log(rivers)) # Notice that the data are neither normal nor log-normal!
 #  74 | 84
 #  76 | 56
 #  78 | 4
-#  80 | 
+#  80 |
 #  82 | 2
 
 # make a histogram:
@@ -109,7 +108,7 @@ sort(discoveries)
 # [76]  4  4  4  4  5  5  5  5  5  5  5  6  6  6  6  6  6  7  7  7  7  8  9 10 12
 
 stem(discoveries, scale=2)
-# 
+#
 #  The decimal point is at the |
 #
 #   0 | 000000000
@@ -130,7 +129,7 @@ max(discoveries)
 # 12
 summary(discoveries)
 #   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-#    0.0     2.0     3.0     3.1     4.0    12.0 
+#    0.0     2.0     3.0     3.1     4.0    12.0
 
 # Roll a die a few times
 round(runif(7, min=.5, max=6.5))
@@ -274,7 +273,7 @@ class(NULL)	# NULL
 parakeet = c("beak", "feathers", "wings", "eyes")
 parakeet
 # =>
-# [1] "beak"     "feathers" "wings"    "eyes"    
+# [1] "beak"     "feathers" "wings"    "eyes"
 parakeet <- NULL
 parakeet
 # =>
@@ -291,7 +290,7 @@ as.numeric("Bilbo")
 # =>
 # [1] NA
 # Warning message:
-# NAs introduced by coercion 
+# NAs introduced by coercion
 
 # Also note: those were just the basic data types
 # There are many more data types, such as for dates, time series, etc.
@@ -431,10 +430,10 @@ mat %*% t(mat)
 mat2 <- cbind(1:4, c("dog", "cat", "bird", "dog"))
 mat2
 # =>
-#      [,1] [,2]   
-# [1,] "1"  "dog"  
-# [2,] "2"  "cat"  
-# [3,] "3"  "bird" 
+#      [,1] [,2]
+# [1,] "1"  "dog"
+# [2,] "2"  "cat"
+# [3,] "3"  "bird"
 # [4,] "4"  "dog"
 class(mat2)	# matrix
 # Again, note what happened!
-- 
cgit v1.2.3


From 7ad97c290436d9f01ba9b5dd2a557869995efa0c Mon Sep 17 00:00:00 2001
From: Kristin Linn <klinn@upenn.edu>
Date: Tue, 20 Oct 2015 17:10:58 -0400
Subject: fix spaces at end-of-lines again

---
 r.html.markdown | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'r.html.markdown')

diff --git a/r.html.markdown b/r.html.markdown
index ce313ecc..8539b10e 100644
--- a/r.html.markdown
+++ b/r.html.markdown
@@ -19,6 +19,8 @@ R is a statistical computing language. It has lots of libraries for uploading an
 # in Windows you can use CTRL-ENTER to execute a line.
 # on Mac it is COMMAND-ENTER
 
+
+
 #############################################################################
 # Stuff you can do without understanding anything about programming
 #############################################################################
@@ -122,13 +124,13 @@ stem(discoveries, scale=2)
 #   8 | 0
 #   9 | 0
 #  10 | 0
-#  11 | 
+#  11 |
 #  12 | 0
 
 max(discoveries)
 # 12
 summary(discoveries)
-#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
 #    0.0     2.0     3.0     3.1     4.0    12.0
 
 # Roll a die a few times
@@ -793,6 +795,7 @@ pp + geom_point()
 # ggplot2 has excellent documentation (available http://docs.ggplot2.org/current/)
 
 
+
 ```
 
 ## How do I get R?
-- 
cgit v1.2.3