summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authore99n09 <ysiioj81pcqu@lavabit.com>2014-05-24 08:54:39 -0400
committere99n09 <ysiioj81pcqu@lavabit.com>2014-05-24 08:54:39 -0400
commita8d8cee0d82a8f8be48f3a80c307223856764a31 (patch)
treebd8efc75099207901dfb9e8cb29002ef86736a27
parent31c74615e6a492db39e1586c3d4aaa7c4ada5e56 (diff)
Update r.html.markdown
Minor changes to comments (fixing typos, etc.). Deleted "weird types" section; broke out "NULL" type into its own type category. Added instructions for dropping rows and columns in data.frame and data.table. How to make summary tables in data.table.
-rw-r--r--r.html.markdown93
1 files changed, 69 insertions, 24 deletions
diff --git a/r.html.markdown b/r.html.markdown
index dfc945c1..cd09e8da 100644
--- a/r.html.markdown
+++ b/r.html.markdown
@@ -188,7 +188,7 @@ class(-Inf) # "numeric"
10L + 66L # 76 # integer plus integer gives integer
53.2 - 4 # 49.2 # numeric minus numeric gives numeric
2.0 * 2L # 4 # numeric times integer gives numeric
-3L / 4 # 0.75 # integer over integer gives numeric
+3L / 4 # 0.75 # integer over numeric gives numeric
3 %% 2 # 1 # the remainder of two numerics is another numeric
# Illegal arithmetic yields you a "not-a-number":
0 / 0 # NaN
@@ -241,27 +241,29 @@ factor(c("female", "female", "male", "NA", "female"))
# Levels: female male NA
# The "levels" are the values the categorical data can take
levels(factor(c("male", "male", "female", "NA", "female"))) # "female" "male" "NA"
-# If a factor has length 1, its levels will have length 1, too
+# If a factor vector has length 1, its levels will have length 1, too
length(factor("male")) # 1
length(levels(factor("male"))) # 1
# Factors are commonly seen in data frames, a data structure we will cover later
-# in this tutorial:
data(infert) # "Infertility after Spontaneous and Induced Abortion"
levels(infert$education) # "0-5yrs" "6-11yrs" "12+ yrs"
-# WEIRD TYPES
-# A quick summary of some of the weirder types in R
-class(Inf) # "numeric"
-class(-Inf) # "numeric"
-class(NaN) # "numeric"
-class(NA) # "logical"
+# NULL
+# "NULL" is a weird one; use it to "blank out" a vector
class(NULL) # NULL
+parakeet <- c("beak", "feathers", "wings", "eyes")
+parakeet
+# =>
+# [1] "beak" "feathers" "wings" "eyes"
+parakeet <- NULL
+parakeet
+# =>
+# NULL
# TYPE COERCION
# Type-coercion is when you force a value to take on a different type
as.character(c(6, 8)) # "6" "8"
as.logical(c(1,0,1,1)) # TRUE FALSE TRUE TRUE
-# If you put elements of different classes into a vector, weird coercions happen:
+# If you put elements of different types into a vector, weird coercions happen:
c(TRUE, 4) # 1 4
c("dog", TRUE, 4) # "dog" "TRUE" "4"
as.numeric("Bilbo")
@@ -332,8 +334,6 @@ jiggle(5) # 5±ε. After set.seed(2716057), jiggle(5)==5.005043
# ONE-DIMENSIONAL
# Let's start from the very beginning, and with something you already know: vectors.
-# As explained above, every single element in R is already a vector
-# Make sure the elements of long vectors all have the same type
vec <- c(8, 9, 10, 11)
vec # 8 9 10 11
# We ask for specific elements by subsetting with square brackets
@@ -345,9 +345,9 @@ month.name[9] # "September"
c(6, 8, 7, 5, 3, 0, 9)[3] # 7
# We can also search for the indices of specific components,
which(vec %% 2 == 0) # 1 3
-# grab just the first or last entry in the vector,
+# grab just the first or last few entries in the vector,
head(vec, 1) # 8
-tail(vec, 1) # 11
+tail(vec, 2) # 10 11
# or figure out if a certain value is in the vector
any(vec == 10) # TRUE
# If an index "goes over" you'll get NA:
@@ -358,7 +358,7 @@ length(vec) # 4
vec * 4 # 16 20 24 28
vec[2:3] * 5 # 25 30
any(vec[2:3] == 8) # FALSE
-# and there are many built-in functions to summarize vectors
+# and R has many built-in functions to summarize vectors
mean(vec) # 9.5
var(vec) # 1.666667
sd(vec) # 1.290994
@@ -368,6 +368,7 @@ sum(vec) # 38
# Some more nice built-ins:
5:15 # 5 6 7 8 9 10 11 12 13 14 15
seq(from=0, to=31337, by=1337)
+# =>
# [1] 0 1337 2674 4011 5348 6685 8022 9359 10696 12033 13370 14707
# [13] 16044 17381 18718 20055 21392 22729 24066 25403 26740 28077 29414 30751
@@ -427,11 +428,11 @@ mat3
# [,1] [,2] [,3] [,4]
# [1,] 1 2 4 5
# [2,] 6 7 0 4
-# Aah, everything of the same class. No coercions. Much better.
+# Ah, everything of the same class. No coercions. Much better.
# TWO-DIMENSIONAL (DIFFERENT CLASSES)
-# For columns of different classes, use the data frame
+# For columns of different types, use a data frame
# This data structure is so useful for statistical programming,
# a version of it was added to Python in the package "pandas".
@@ -465,11 +466,11 @@ students$year # 3 2 2 1 0 -1
students[,2] # 3 2 2 1 0 -1
students[,"year"] # 3 2 2 1 0 -1
-# A popular replacement for the data.frame structure is the data.table
+# An augmented version of the data.frame structure is the data.table
# If you're working with huge or panel data, or need to merge a few data
# sets, data.table can be a good choice. Here's a whirlwind tour:
-install.packages("data.table")
-require(data.table)
+install.packages("data.table") # download the package from CRAN
+require(data.table) # load it
students <- as.data.table(students)
students # note the slightly different print-out
# =>
@@ -480,15 +481,17 @@ students # note the slightly different print-out
# 4: Cho 1 R
# 5: Draco 0 S
# 6: Ginny -1 G
-students[name=="Ginny"]
+students[name=="Ginny"] # get rows with name == "Ginny"
# =>
# name year house
# 1: Ginny -1 G
-students[year==2]
+students[year==2] # get rows with year == 2
# =>
# name year house
# 1: Fred 2 G
# 2: George 2 G
+# data.table makes merging two data sets easy
+# let's make another data.table to merge with students
founders <- data.table(house=c("G","H","R","S"),
founder=c("Godric","Helga","Rowena","Salazar"))
founders
@@ -500,7 +503,7 @@ founders
# 4: S Salazar
setkey(students, house)
setkey(founders, house)
-students <- founders[students] # merge the two data sets
+students <- founders[students] # merge the two data sets by matching "house"
setnames(students, c("house","houseFounderName","studentName","year"))
students[,order(c("name","year","house","houseFounderName")), with=F]
# =>
@@ -512,9 +515,51 @@ students[,order(c("name","year","house","houseFounderName")), with=F]
# 5: Cho 1 R Rowena
# 6: Draco 0 S Salazar
-# MULTI-DIMENSIONAL (ALL OF ONE CLASS)
+# data.table makes summary tables easy
+students[,sum(year),by=house]
+# =>
+# house V1
+# 1: G 3
+# 2: H 3
+# 3: R 1
+# 4: S 0
+
+# To drop a column from a data.frame or data.table,
+# assign it the NULL value
+students$houseFounderName <- NULL
+students
+# =>
+# house studentName year
+# 1: G Fred 2
+# 2: G George 2
+# 3: G Ginny -1
+# 4: H Cedric 3
+# 5: R Cho 1
+# 6: S Draco 0
+
+# Drop a row by subsetting
+# Using data.table:
+students[studentName != "Draco"]
+# =>
+# house studentName year
+# 1: G Fred 2
+# 2: G George 2
+# 3: G Ginny -1
+# 4: H Cedric 3
+# 5: R Cho 1
+# Using data.frame:
+students <- as.data.frame(students)
+students[students$house != "G",]
+# =>
+# house studentName year
+# 4 H Cedric 3
+# 5 R Cho 1
+# 6 S Draco 0
+
+# MULTI-DIMENSIONAL (ALL ELEMENTS OF ONE TYPE)
# Arrays create n-dimensional tables
+# All elements must be of the same type
# You can make a two-dimensional table (sort of like a matrix)
array(c(c(1,2,4,5),c(8,9,3,6)), dim=c(2,4))
# =>