diff options
| author | Suzane Sant Ana <tetestonaldo@gmail.com> | 2017-12-31 14:27:06 -0200 | 
|---|---|---|
| committer | GitHub <noreply@github.com> | 2017-12-31 14:27:06 -0200 | 
| commit | 42f9329bb3a028d374d6397991ac48b44064741e (patch) | |
| tree | 1e75e2b3e122aeb863e3ffa037f6f64c4027fbf8 /r.html.markdown | |
| parent | e6b77595f2669d66ac7be43c6e6083cbff80a9a7 (diff) | |
| parent | 70a36c9bd970b928adde06afb2bd69f6ba8e5d5c (diff) | |
Merge pull request #1 from adambard/master
update
Diffstat (limited to 'r.html.markdown')
| -rw-r--r-- | r.html.markdown | 162 | 
1 files changed, 135 insertions, 27 deletions
| diff --git a/r.html.markdown b/r.html.markdown index c555d748..e7486e60 100644 --- a/r.html.markdown +++ b/r.html.markdown @@ -3,6 +3,7 @@ language: R  contributors:      - ["e99n09", "http://github.com/e99n09"]      - ["isomorphismes", "http://twitter.com/isomorphisms"] +    - ["kalinn", "http://github.com/kalinn"]  filename: learnr.r  --- @@ -15,7 +16,8 @@ R is a statistical computing language. It has lots of libraries for uploading an  # You can't make multi-line comments,  # but you can stack multiple comments like so. -# in Windows or Mac, hit COMMAND-ENTER to execute a line +# in Windows you can use CTRL-ENTER to execute a line. +# on Mac it is COMMAND-ENTER @@ -36,8 +38,8 @@ head(rivers)	# peek at the data set  length(rivers)	# how many rivers were measured?  # 141  summary(rivers) # what are some summary statistics? -#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.  -#  135.0   310.0   425.0   591.2   680.0  3710.0  +#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. +#  135.0   310.0   425.0   591.2   680.0  3710.0  # make a stem-and-leaf plot (a histogram-like data visualization)  stem(rivers) @@ -54,14 +56,14 @@ stem(rivers)  #  14 | 56  #  16 | 7  #  18 | 9 -#  20 |  +#  20 |  #  22 | 25  #  24 | 3 -#  26 |  -#  28 |  -#  30 |  -#  32 |  -#  34 |  +#  26 | +#  28 | +#  30 | +#  32 | +#  34 |  #  36 | 1  stem(log(rivers)) # Notice that the data are neither normal nor log-normal! @@ -70,7 +72,7 @@ stem(log(rivers)) # Notice that the data are neither normal nor log-normal!  #  The decimal point is 1 digit(s) to the left of the |  #  #  48 | 1 -#  50 |  +#  50 |  #  52 | 15578  #  54 | 44571222466689  #  56 | 023334677000124455789 @@ -85,7 +87,7 @@ stem(log(rivers)) # Notice that the data are neither normal nor log-normal!  #  74 | 84  #  76 | 56  #  78 | 4 -#  80 |  +#  80 |  #  82 | 2  # make a histogram: @@ -108,7 +110,7 @@ sort(discoveries)  # [76]  4  4  4  4  5  5  5  5  5  5  5  6  6  6  6  6  6  7  7  7  7  8  9 10 12  stem(discoveries, scale=2) -#  +#  #  The decimal point is at the |  #  #   0 | 000000000 @@ -122,14 +124,14 @@ stem(discoveries, scale=2)  #   8 | 0  #   9 | 0  #  10 | 0 -#  11 |  +#  11 |  #  12 | 0  max(discoveries)  # 12  summary(discoveries) -#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.  -#    0.0     2.0     3.0     3.1     4.0    12.0  +#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. +#    0.0     2.0     3.0     3.1     4.0    12.0  # Roll a die a few times  round(runif(7, min=.5, max=6.5)) @@ -190,12 +192,20 @@ class(-Inf)	# "numeric"  2.0 * 2L  # 4       # numeric times integer gives numeric  3L / 4    # 0.75    # integer over numeric gives numeric  3 %% 2	  # 1       # the remainder of two numerics is another numeric -# Illegal arithmetic yeilds you a "not-a-number": +# Illegal arithmetic yields you a "not-a-number":  0 / 0 # NaN  class(NaN) # "numeric"  # You can do arithmetic on two vectors with length greater than 1,  # so long as the larger vector's length is an integer multiple of the smaller  c(1,2,3) + c(1,2,3) # 2 4 6 +# Since a single number is a vector of length one, scalars are applied  +# elementwise to vectors +(4 * c(1,2,3) - 2) / 2 # 1 3 5 +# Except for scalars, use caution when performing arithmetic on vectors with  +# different lengths. Although it can be done,  +c(1,2,3,1,2,3) * c(1,2) # 1 4 3 2 2 6 +# Matching lengths is better practice and easier to read +c(1,2,3,1,2,3) * c(1,2,1,2,1,2)   # CHARACTERS  # There's no difference between strings and characters in R @@ -229,6 +239,16 @@ FALSE != FALSE	# FALSE  FALSE != TRUE	# TRUE  # Missing data (NA) is logical, too  class(NA)	# "logical" +# Use | and & for logic operations. +# OR +TRUE | FALSE	# TRUE +# AND +TRUE & FALSE	# FALSE +# Applying | and & to vectors returns elementwise logic operations +c(TRUE,FALSE,FALSE) | c(FALSE,TRUE,FALSE) # TRUE TRUE FALSE +c(TRUE,FALSE,TRUE) & c(FALSE,TRUE,TRUE) # FALSE FALSE TRUE +# You can test if x is TRUE +isTRUE(TRUE)	# TRUE  # Here we get a logical vector with many elements:  c('Z', 'o', 'r', 'r', 'o') == "Zorro" # FALSE FALSE FALSE FALSE FALSE  c('Z', 'o', 'r', 'r', 'o') == "Z" # TRUE FALSE FALSE FALSE FALSE @@ -252,9 +272,10 @@ levels(infert$education) # "0-5yrs"  "6-11yrs" "12+ yrs"  # NULL  # "NULL" is a weird one; use it to "blank out" a vector  class(NULL)	# NULL +parakeet = c("beak", "feathers", "wings", "eyes")  parakeet  # => -# [1] "beak"     "feathers" "wings"    "eyes"     +# [1] "beak"     "feathers" "wings"    "eyes"  parakeet <- NULL  parakeet  # => @@ -271,7 +292,7 @@ as.numeric("Bilbo")  # =>  # [1] NA  # Warning message: -# NAs introduced by coercion  +# NAs introduced by coercion  # Also note: those were just the basic data types  # There are many more data types, such as for dates, time series, etc. @@ -411,10 +432,10 @@ mat %*% t(mat)  mat2 <- cbind(1:4, c("dog", "cat", "bird", "dog"))  mat2  # => -#      [,1] [,2]    -# [1,] "1"  "dog"   -# [2,] "2"  "cat"   -# [3,] "3"  "bird"  +#      [,1] [,2] +# [1,] "1"  "dog" +# [2,] "2"  "cat" +# [3,] "3"  "bird"  # [4,] "4"  "dog"  class(mat2)	# matrix  # Again, note what happened! @@ -641,8 +662,9 @@ require(plyr)  #########################  # "pets.csv" is a file on the internet -# (but it could just as easily be be a file on your own computer) -pets <- read.csv("http://learnxinyminutes.com/docs/pets.csv") +# (but it could just as easily be a file on your own computer) +require(RCurl) +pets <- read.csv(textConnection(getURL("http://learnxinyminutes.com/docs/pets.csv")))  pets  head(pets, 2) # first two rows  tail(pets, 1) # last row @@ -656,15 +678,101 @@ write.csv(pets, "pets2.csv") # to make a new .csv file  ######################### +# Statistical Analysis +######################### + +# Linear regression! +linearModel <- lm(price  ~ time, data = list1) +linearModel # outputs result of regression +# => +# Call: +# lm(formula = price ~ time, data = list1) +#  +# Coefficients: +# (Intercept)         time   +#      0.1453       0.4943   +summary(linearModel) # more verbose output from the regression +# => +# Call: +# lm(formula = price ~ time, data = list1) +# +# Residuals: +#     Min      1Q  Median      3Q     Max  +# -8.3134 -3.0131 -0.3606  2.8016 10.3992  +# +# Coefficients: +#             Estimate Std. Error t value Pr(>|t|)     +# (Intercept)  0.14527    1.50084   0.097    0.923     +# time         0.49435    0.06379   7.749 2.44e-09 *** +# --- +# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 +# +# Residual standard error: 4.657 on 38 degrees of freedom +# Multiple R-squared:  0.6124,	Adjusted R-squared:  0.6022  +# F-statistic: 60.05 on 1 and 38 DF,  p-value: 2.44e-09 +coef(linearModel) # extract estimated parameters +# => +# (Intercept)        time  +#   0.1452662   0.4943490  +summary(linearModel)$coefficients # another way to extract results +# => +#              Estimate Std. Error    t value     Pr(>|t|) +# (Intercept) 0.1452662 1.50084246 0.09678975 9.234021e-01 +# time        0.4943490 0.06379348 7.74920901 2.440008e-09 +summary(linearModel)$coefficients[,4] # the p-values  +# => +#  (Intercept)         time  +# 9.234021e-01 2.440008e-09  + +# GENERAL LINEAR MODELS +# Logistic regression +set.seed(1) +list1$success = rbinom(length(list1$time), 1, .5) # random binary +glModel <- glm(success  ~ time, data = list1,  +	family=binomial(link="logit")) +glModel # outputs result of logistic regression +# => +# Call:  glm(formula = success ~ time,  +#	family = binomial(link = "logit"), data = list1) +# +# Coefficients: +# (Intercept)         time   +#     0.17018     -0.01321   +#  +# Degrees of Freedom: 39 Total (i.e. Null);  38 Residual +# Null Deviance:	    55.35  +# Residual Deviance: 55.12 	 AIC: 59.12 +summary(glModel) # more verbose output from the regression +# => +# Call: +# glm(formula = success ~ time,  +#	family = binomial(link = "logit"), data = list1) + +# Deviance Residuals:  +#    Min      1Q  Median      3Q     Max   +# -1.245  -1.118  -1.035   1.202   1.327   +#  +# Coefficients: +#             Estimate Std. Error z value Pr(>|z|) +# (Intercept)  0.17018    0.64621   0.263    0.792 +# time        -0.01321    0.02757  -0.479    0.632 +#  +# (Dispersion parameter for binomial family taken to be 1) +# +#     Null deviance: 55.352  on 39  degrees of freedom +# Residual deviance: 55.121  on 38  degrees of freedom +# AIC: 59.121 +#  +# Number of Fisher Scoring iterations: 3 + + +#########################  # Plots  #########################  # BUILT-IN PLOTTING FUNCTIONS  # Scatterplots!  plot(list1$time, list1$price, main = "fake data") -# Regressions! -linearModel <- lm(price  ~ time, data = list1) -linearModel # outputs result of regression  # Plot regression line on existing plot  abline(linearModel, col = "red")  # Get a variety of nice diagnostics | 
