author     C. Bess <cbess@company.com>  2015-11-09 17:55:53 -0600
committer  C. Bess <cbess@company.com>  2015-11-09 17:55:53 -0600
commit     df0992d72c2a28f140e6ff9681c505f36e19249a (patch)
tree       508aa3abe4c25b957dca442560d9c95c9b1fc97a /r.html.markdown
parent     afc5ea14654e0e9cd11c7ef1b672639d12418bad (diff)
parent     c460e1fafa0e9b4edc6a5cb35b970bb5cc030a81 (diff)
Merge remote-tracking branch 'adambard/master'
Conflicts: swift.html.markdown
Diffstat (limited to 'r.html.markdown')
-rw-r--r--  r.html.markdown | 155
1 file changed, 131 insertions(+), 24 deletions(-)
diff --git a/r.html.markdown b/r.html.markdown
index c555d748..8539b10e 100644
--- a/r.html.markdown
+++ b/r.html.markdown
@@ -3,6 +3,7 @@ language: R
contributors:
- ["e99n09", "http://github.com/e99n09"]
- ["isomorphismes", "http://twitter.com/isomorphisms"]
+ - ["kalinn", "http://github.com/kalinn"]
filename: learnr.r
---
@@ -15,7 +16,8 @@ R is a statistical computing language. It has lots of libraries for uploading an
# You can't make multi-line comments,
# but you can stack multiple comments like so.
-# in Windows or Mac, hit COMMAND-ENTER to execute a line
+# in Windows you can use CTRL-ENTER to execute a line.
+# on Mac it is COMMAND-ENTER
@@ -36,8 +38,8 @@ head(rivers) # peek at the data set
length(rivers) # how many rivers were measured?
# 141
summary(rivers) # what are some summary statistics?
-# Min. 1st Qu. Median Mean 3rd Qu. Max.
-# 135.0 310.0 425.0 591.2 680.0 3710.0
+# Min. 1st Qu. Median Mean 3rd Qu. Max.
+# 135.0 310.0 425.0 591.2 680.0 3710.0
# make a stem-and-leaf plot (a histogram-like data visualization)
stem(rivers)
@@ -54,14 +56,14 @@ stem(rivers)
# 14 | 56
# 16 | 7
# 18 | 9
-# 20 |
+# 20 |
# 22 | 25
# 24 | 3
-# 26 |
-# 28 |
-# 30 |
-# 32 |
-# 34 |
+# 26 |
+# 28 |
+# 30 |
+# 32 |
+# 34 |
# 36 | 1
stem(log(rivers)) # Notice that the data are neither normal nor log-normal!
@@ -70,7 +72,7 @@ stem(log(rivers)) # Notice that the data are neither normal nor log-normal!
# The decimal point is 1 digit(s) to the left of the |
#
# 48 | 1
-# 50 |
+# 50 |
# 52 | 15578
# 54 | 44571222466689
# 56 | 023334677000124455789
@@ -85,7 +87,7 @@ stem(log(rivers)) # Notice that the data are neither normal nor log-normal!
# 74 | 84
# 76 | 56
# 78 | 4
-# 80 |
+# 80 |
# 82 | 2
# make a histogram:
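A minimal sketch of the histogram call that follows outside this hunk's context, assuming base R graphics (the break count and labels are illustrative assumptions, not the tutorial's exact arguments):

hist(rivers, breaks = 25,
     main = "Lengths of major North American rivers",
     xlab = "length (miles)")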
@@ -108,7 +110,7 @@ sort(discoveries)
# [76] 4 4 4 4 5 5 5 5 5 5 5 6 6 6 6 6 6 7 7 7 7 8 9 10 12
stem(discoveries, scale=2)
-#
+#
# The decimal point is at the |
#
# 0 | 000000000
@@ -122,14 +124,14 @@ stem(discoveries, scale=2)
# 8 | 0
# 9 | 0
# 10 | 0
-# 11 |
+# 11 |
# 12 | 0
max(discoveries)
# 12
summary(discoveries)
-# Min. 1st Qu. Median Mean 3rd Qu. Max.
-# 0.0 2.0 3.0 3.1 4.0 12.0
+# Min. 1st Qu. Median Mean 3rd Qu. Max.
+# 0.0 2.0 3.0 3.1 4.0 12.0
# Roll a die a few times
round(runif(7, min=.5, max=6.5))
@@ -196,6 +198,14 @@ class(NaN) # "numeric"
# You can do arithmetic on two vectors with length greater than 1,
# so long as the larger vector's length is an integer multiple of the smaller
c(1,2,3) + c(1,2,3) # 2 4 6
+# Since a single number is a vector of length one, scalars are applied
+# elementwise to vectors
+(4 * c(1,2,3) - 2) / 2 # 1 3 5
+# Except for scalars, use caution when performing arithmetic on vectors with
+# different lengths. Although it can be done,
+c(1,2,3,1,2,3) * c(1,2) # 1 4 3 2 2 6
+# Matching lengths is better practice and easier to read
+c(1,2,3,1,2,3) * c(1,2,1,2,1,2) # 1 4 3 2 2 6
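For completeness, a short sketch (not part of this diff) of what happens when the vector lengths are not multiples of each other: R still recycles the shorter vector, but warns about it.

c(1,2,3) + c(1,2) # 2 4 4
# Warning message:
# In c(1, 2, 3) + c(1, 2) :
#   longer object length is not a multiple of shorter object length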
# CHARACTERS
# There's no difference between strings and characters in R
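The examples backing this claim sit below the excerpt; a minimal sketch of the idea, assuming the usual base-R behavior:

class("Horatio") # "character"
class('H')       # "character" -- quote style doesn't matter; there is no separate char type
nchar("Horatio") # 7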
@@ -229,6 +239,16 @@ FALSE != FALSE # FALSE
FALSE != TRUE # TRUE
# Missing data (NA) is logical, too
class(NA) # "logical"
+# Use | and & for logic operations.
+# OR
+TRUE | FALSE # TRUE
+# AND
+TRUE & FALSE # FALSE
+# Applying | and & to vectors returns elementwise logic operations
+c(TRUE,FALSE,FALSE) | c(FALSE,TRUE,FALSE) # TRUE TRUE FALSE
+c(TRUE,FALSE,TRUE) & c(FALSE,TRUE,TRUE) # FALSE FALSE TRUE
+# You can test if x is TRUE
+isTRUE(TRUE) # TRUE
# Here we get a logical vector with many elements:
c('Z', 'o', 'r', 'r', 'o') == "Zorro" # FALSE FALSE FALSE FALSE FALSE
c('Z', 'o', 'r', 'r', 'o') == "Z" # TRUE FALSE FALSE FALSE FALSE
@@ -252,9 +272,10 @@ levels(infert$education) # "0-5yrs" "6-11yrs" "12+ yrs"
# NULL
# "NULL" is a weird one; use it to "blank out" a vector
class(NULL) # NULL
+parakeet = c("beak", "feathers", "wings", "eyes")
parakeet
# =>
-# [1] "beak" "feathers" "wings" "eyes"
+# [1] "beak" "feathers" "wings" "eyes"
parakeet <- NULL
parakeet
# =>
@@ -271,7 +292,7 @@ as.numeric("Bilbo")
# =>
# [1] NA
# Warning message:
-# NAs introduced by coercion
+# NAs introduced by coercion
# Also note: those were just the basic data types
# There are many more data types, such as for dates, time series, etc.
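Neither dates nor time series are demonstrated in this excerpt; a minimal hedged sketch using base R's Date and ts classes:

class(as.Date("2015-11-09")) # "Date"
class(Sys.Date())            # "Date"
class(ts(rnorm(12), frequency = 12, start = c(2015, 1))) # "ts"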
@@ -411,10 +432,10 @@ mat %*% t(mat)
mat2 <- cbind(1:4, c("dog", "cat", "bird", "dog"))
mat2
# =>
-# [,1] [,2]
-# [1,] "1" "dog"
-# [2,] "2" "cat"
-# [3,] "3" "bird"
+# [,1] [,2]
+# [1,] "1" "dog"
+# [2,] "2" "cat"
+# [3,] "3" "bird"
# [4,] "4" "dog"
class(mat2) # matrix
# Again, note what happened!
@@ -656,15 +677,101 @@ write.csv(pets, "pets2.csv") # to make a new .csv file
#########################
+# Statistical Analysis
+#########################
+
+# Linear regression!
+linearModel <- lm(price ~ time, data = list1)
+linearModel # outputs result of regression
+# =>
+# Call:
+# lm(formula = price ~ time, data = list1)
+#
+# Coefficients:
+# (Intercept) time
+# 0.1453 0.4943
+summary(linearModel) # more verbose output from the regression
+# =>
+# Call:
+# lm(formula = price ~ time, data = list1)
+#
+# Residuals:
+# Min 1Q Median 3Q Max
+# -8.3134 -3.0131 -0.3606 2.8016 10.3992
+#
+# Coefficients:
+# Estimate Std. Error t value Pr(>|t|)
+# (Intercept) 0.14527 1.50084 0.097 0.923
+# time 0.49435 0.06379 7.749 2.44e-09 ***
+# ---
+# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+#
+# Residual standard error: 4.657 on 38 degrees of freedom
+# Multiple R-squared: 0.6124, Adjusted R-squared: 0.6022
+# F-statistic: 60.05 on 1 and 38 DF, p-value: 2.44e-09
+coef(linearModel) # extract estimated parameters
+# =>
+# (Intercept) time
+# 0.1452662 0.4943490
+summary(linearModel)$coefficients # another way to extract results
+# =>
+# Estimate Std. Error t value Pr(>|t|)
+# (Intercept) 0.1452662 1.50084246 0.09678975 9.234021e-01
+# time 0.4943490 0.06379348 7.74920901 2.440008e-09
+summary(linearModel)$coefficients[,4] # the p-values
+# =>
+# (Intercept) time
+# 9.234021e-01 2.440008e-09
+
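A natural follow-up not covered by this diff is generating predictions from the fitted model; a hedged sketch using base R's predict() (the time values are illustrative):

predict(linearModel, newdata = data.frame(time = c(1, 2, 3)))
# => the fitted prices at time = 1, 2, 3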
+# GENERAL LINEAR MODELS
+# Logistic regression
+set.seed(1)
+list1$success = rbinom(length(list1$time), 1, .5) # random binary
+glModel <- glm(success ~ time, data = list1,
+ family=binomial(link="logit"))
+glModel # outputs result of logistic regression
+# =>
+# Call: glm(formula = success ~ time,
+# family = binomial(link = "logit"), data = list1)
+#
+# Coefficients:
+# (Intercept) time
+# 0.17018 -0.01321
+#
+# Degrees of Freedom: 39 Total (i.e. Null); 38 Residual
+# Null Deviance: 55.35
+# Residual Deviance: 55.12 AIC: 59.12
+summary(glModel) # more verbose output from the regression
+# =>
+# Call:
+# glm(formula = success ~ time,
+# family = binomial(link = "logit"), data = list1)
+#
+# Deviance Residuals:
+# Min 1Q Median 3Q Max
+# -1.245 -1.118 -1.035 1.202 1.327
+#
+# Coefficients:
+# Estimate Std. Error z value Pr(>|z|)
+# (Intercept) 0.17018 0.64621 0.263 0.792
+# time -0.01321 0.02757 -0.479 0.632
+#
+# (Dispersion parameter for binomial family taken to be 1)
+#
+# Null deviance: 55.352 on 39 degrees of freedom
+# Residual deviance: 55.121 on 38 degrees of freedom
+# AIC: 59.121
+#
+# Number of Fisher Scoring iterations: 3
+
+
+#########################
# Plots
#########################
# BUILT-IN PLOTTING FUNCTIONS
# Scatterplots!
plot(list1$time, list1$price, main = "fake data")
-# Regressions!
-linearModel <- lm(price ~ time, data = list1)
-linearModel # outputs result of regression
# Plot regression line on existing plot
abline(linearModel, col = "red")
# Get a variety of nice diagnostics
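The diagnostic call itself is cut off by the diff context; it is presumably the standard base-R idiom of plotting the fitted model object:

plot(linearModel)
# cycles through residuals-vs-fitted, normal Q-Q, scale-location,
# and residuals-vs-leverage plots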