From 8b3cc63b3e3441b8a8f73a5983f0de0fdd10cf02 Mon Sep 17 00:00:00 2001
From: Adam <adam@adambard.com>
Date: Mon, 19 Oct 2015 14:33:11 +0800
Subject: Fixed pythonstatcomp doc naming

---
 pythonstatcomp.html.markdown | 234 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 234 insertions(+)
 create mode 100644 pythonstatcomp.html.markdown

(limited to 'pythonstatcomp.html.markdown')

diff --git a/pythonstatcomp.html.markdown b/pythonstatcomp.html.markdown
new file mode 100644
index 00000000..78b62e33
--- /dev/null
+++ b/pythonstatcomp.html.markdown
@@ -0,0 +1,234 @@
+---
+language: Statistical computing with Python
+contributors:
+    - ["e99n09", "https://github.com/e99n09"]
+filename: pythonstatcomp.py
+---
+
+This is a tutorial on how to do some typical statistical programming tasks using Python. It's intended for people basically familiar with Python and experienced at statistical programming in a language like R, Stata, SAS, SPSS, or MATLAB.
+
+```python
+
+# 0. Getting set up ====
+
+""" Get set up with IPython and pip install the following: numpy, scipy, pandas,
+    matplotlib, seaborn, requests.
+        Make sure to do this tutorial in the IPython notebook so that you get
+    the inline plots and easy documentation lookup.
+"""
+
+# 1. Data acquisition ====
+
+""" One reason people choose Python over R is that they intend to interact a lot
+    with the web, either by scraping pages directly or requesting data through
+    an API. You can do those things in R, but in the context of a project
+    already using Python, there's a benefit to sticking with one language.
+"""
+
+import requests # for HTTP requests (web scraping, APIs)
+import os
+
+# web scraping
+r = requests.get("https://github.com/adambard/learnxinyminutes-docs")
+r.status_code # if 200, request was successful
+r.text # raw page source
+print(r.text) # prettily formatted
+# save the page source in a file:
+os.getcwd() # check what's the working directory
+f = open("learnxinyminutes.html","wb")
+f.write(r.text.encode("UTF-8"))
+f.close()
+
+# downloading a csv
+fp = "https://raw.githubusercontent.com/adambard/learnxinyminutes-docs/master/"
+fn = "pets.csv"
+r = requests.get(fp + fn)
+print(r.text)
+f = open(fn,"wb")
+f.write(r.text.encode("UTF-8"))
+f.close()
+
+""" for more on the requests module, including APIs, see
+    http://docs.python-requests.org/en/latest/user/quickstart/
+"""
+
+# 2. Reading a CSV file ====
+
+""" Wes McKinney's pandas package gives you 'DataFrame' objects in Python. If
+    you've used R, you will be familiar with the idea of the "data.frame" already.
+"""
+
+import pandas as pd, numpy as np, scipy as sp
+pets = pd.read_csv(fn)
+pets
+#        name  age  weight species
+# 0    fluffy    3      14     cat
+# 1  vesuvius    6      23    fish
+# 2       rex    5      34     dog
+
+""" R users: note that Python, like most normal programming languages, starts
+    indexing from 0. R is the unusual one for starting from 1.
+"""
+
+# two different ways to print out a column
+pets.age
+pets["age"]
+
+pets.head(2) # prints first 2 rows
+pets.tail(1) # prints last row
+
+pets.name[1] # 'vesuvius'
+pets.species[0] # 'cat'
+pets["weight"][2] # 34
+
+# in R, you would expect to get 3 rows doing this, but here you get 2:
+pets.age[0:2]
+# 0    3
+# 1    6
+
+sum(pets.age)*2 # 28
+max(pets.weight) - min(pets.weight) # 20
+
+""" If you are doing some serious linear algebra and number-crunching, you may
+    just want arrays, not DataFrames. DataFrames are ideal for combining columns
+    of different types.
+"""
+
+# 3. Charts ====
+
+import matplotlib as mpl, matplotlib.pyplot as plt
+%matplotlib inline
+
+# To do data vizualization in Python, use matplotlib
+
+plt.hist(pets.age);
+
+plt.boxplot(pets.weight);
+
+plt.scatter(pets.age, pets.weight); plt.xlabel("age"); plt.ylabel("weight");
+
+# seaborn sits atop matplotlib and makes plots prettier
+
+import seaborn as sns
+
+plt.scatter(pets.age, pets.weight); plt.xlabel("age"); plt.ylabel("weight");
+
+# there are also some seaborn-specific plotting functions
+# notice how seaborn automatically labels the x-axis on this barplot
+sns.barplot(pets["age"])
+
+# R veterans can still use ggplot
+from ggplot import *
+ggplot(aes(x="age",y="weight"), data=pets) + geom_point() + labs(title="pets")
+# source: https://pypi.python.org/pypi/ggplot
+
+# there's even a d3.js port: https://github.com/mikedewar/d3py
+
+# 4. Simple data cleaning and exploratory analysis ====
+
+""" Here's a more complicated example that demonstrates a basic data
+    cleaning workflow leading to the creation of some exploratory plots
+    and the running of a linear regression.
+        The data set was transcribed from Wikipedia by hand. It contains
+    all the Holy Roman Emperors and the important milestones in their lives
+    (birth, death, coronation, etc.).
+        The goal of the analysis will be to explore whether a relationship
+    exists between emperor birth year and emperor lifespan.
+    data source: https://en.wikipedia.org/wiki/Holy_Roman_Emperor
+"""
+
+# load some data on Holy Roman Emperors
+url = "https://raw.githubusercontent.com/e99n09/R-notes/master/data/hre.csv"
+r = requests.get(url)
+fp = "hre.csv"
+f = open(fp,"wb")
+f.write(r.text.encode("UTF-8"))
+f.close()
+
+hre = pd.read_csv(fp)
+
+hre.head()
+"""
+   Ix      Dynasty        Name        Birth             Death Election 1  
+0 NaN  Carolingian   Charles I  2 April 742    28 January 814        NaN   
+1 NaN  Carolingian     Louis I          778       20 June 840        NaN   
+2 NaN  Carolingian   Lothair I          795  29 September 855        NaN   
+3 NaN  Carolingian    Louis II          825     12 August 875        NaN   
+4 NaN  Carolingian  Charles II  13 June 823     6 October 877        NaN   
+
+  Election 2      Coronation 1   Coronation 2 Ceased to be Emperor  
+0        NaN   25 December 800            NaN       28 January 814   
+1        NaN  11 September 813  5 October 816          20 June 840   
+2        NaN       5 April 823            NaN     29 September 855   
+3        NaN        Easter 850     18 May 872        12 August 875   
+4        NaN   29 December 875            NaN        6 October 877   
+
+  Descent from whom 1 Descent how 1 Descent from whom 2 Descent how 2  
+0                 NaN           NaN                 NaN           NaN  
+1           Charles I           son                 NaN           NaN  
+2             Louis I           son                 NaN           NaN  
+3           Lothair I           son                 NaN           NaN  
+4             Louis I           son                 NaN           NaN  
+"""
+
+# clean the Birth and Death columns
+
+import re # module for regular expressions
+
+rx = re.compile(r'\d+$') # match trailing digits
+
+""" This function applies the regular expression to an input column (here Birth,
+    Death), flattens the resulting list, converts it to a Series object, and
+    finally converts the type of the Series object from string to integer. For
+    more information on what the different parts of the code do, see:
+      - https://docs.python.org/2/howto/regex.html
+      - http://stackoverflow.com/questions/11860476/how-to-unlist-a-python-list
+      - http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.html
+"""
+def extractYear(v):
+    return(pd.Series(reduce(lambda x,y: x+y,map(rx.findall,v),[])).astype(int))
+
+hre["BirthY"] = extractYear(hre.Birth)
+hre["DeathY"] = extractYear(hre.Death)
+
+# make a column telling estimated age
+hre["EstAge"] = hre.DeathY.astype(int) - hre.BirthY.astype(int)
+
+# simple scatterplot, no trend line, color represents dynasty
+sns.lmplot("BirthY", "EstAge", data=hre, hue="Dynasty", fit_reg=False);
+
+# use scipy to run a linear regression
+from scipy import stats
+(slope,intercept,rval,pval,stderr)=stats.linregress(hre.BirthY,hre.EstAge)
+# code source: http://wiki.scipy.org/Cookbook/LinearRegression
+
+# check the slope
+slope # 0.0057672618839073328
+
+# check the R^2 value:
+rval**2 # 0.020363950027333586
+
+# check the p-value
+pval # 0.34971812581498452
+
+# use seaborn to make a scatterplot and plot the linear regression trend line
+sns.lmplot("BirthY", "EstAge", data=hre);
+
+""" For more information on seaborn, see
+      - http://web.stanford.edu/~mwaskom/software/seaborn/
+      - https://github.com/mwaskom/seaborn
+    For more information on SciPy, see
+      - http://wiki.scipy.org/SciPy
+      - http://wiki.scipy.org/Cookbook/
+    To see a version of the Holy Roman Emperors analysis using R, see
+      - http://github.com/e99n09/R-notes/blob/master/holy_roman_emperors_dates.R
+"""
+```
+
+If you want to learn more, get _Python for Data Analysis_ by Wes McKinney. It's a superb resource and I used it as a reference when writing this tutorial.
+
+You can also find plenty of interactive IPython tutorials on subjects specific to your interests, like Cam Davidson-Pilon's <a href="http://camdavidsonpilon.github.io/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers/" Title="Probabilistic Programming and Bayesian Methods for Hackers">Probabilistic Programming and Bayesian Methods for Hackers</a>.
+
+Some more modules to research:
+   - text analysis and natural language processing: nltk, http://www.nltk.org
+   - social network analysis: igraph, http://igraph.org/python/
-- 
cgit v1.2.3


From bde8645cc7bb7f0a88b5d106cd0bd0b7e40886d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20Migda=C5=82?= <pmigdal@gmail.com>
Date: Sun, 3 Jan 2016 19:45:54 +0100
Subject: pep8 fixes (spaces and multiline statements)

In Python, readability and code style matter
---
 pythonstatcomp.html.markdown | 103 ++++++++++++++++++++++++-------------------
 1 file changed, 57 insertions(+), 46 deletions(-)

(limited to 'pythonstatcomp.html.markdown')

diff --git a/pythonstatcomp.html.markdown b/pythonstatcomp.html.markdown
index 78b62e33..f8d83b98 100644
--- a/pythonstatcomp.html.markdown
+++ b/pythonstatcomp.html.markdown
@@ -9,6 +9,8 @@ This is a tutorial on how to do some typical statistical programming tasks using
 
 ```python
 
+
+
 # 0. Getting set up ====
 
 """ Get set up with IPython and pip install the following: numpy, scipy, pandas,
@@ -25,17 +27,17 @@ This is a tutorial on how to do some typical statistical programming tasks using
     already using Python, there's a benefit to sticking with one language.
 """
 
-import requests # for HTTP requests (web scraping, APIs)
+import requests  # for HTTP requests (web scraping, APIs)
 import os
 
 # web scraping
 r = requests.get("https://github.com/adambard/learnxinyminutes-docs")
-r.status_code # if 200, request was successful
-r.text # raw page source
-print(r.text) # prettily formatted
+r.status_code  # if 200, request was successful
+r.text  # raw page source
+print(r.text)  # prettily formatted
 # save the page source in a file:
-os.getcwd() # check what's the working directory
-f = open("learnxinyminutes.html","wb")
+os.getcwd()  # check what's the working directory
+f = open("learnxinyminutes.html", "wb")
 f.write(r.text.encode("UTF-8"))
 f.close()
 
@@ -44,7 +46,7 @@ fp = "https://raw.githubusercontent.com/adambard/learnxinyminutes-docs/master/"
 fn = "pets.csv"
 r = requests.get(fp + fn)
 print(r.text)
-f = open(fn,"wb")
+f = open(fn, "wb")
 f.write(r.text.encode("UTF-8"))
 f.close()
 
@@ -58,7 +60,9 @@ f.close()
     you've used R, you will be familiar with the idea of the "data.frame" already.
 """
 
-import pandas as pd, numpy as np, scipy as sp
+import pandas as pd
+import numpy as np
+import scipy as sp
 pets = pd.read_csv(fn)
 pets
 #        name  age  weight species
@@ -74,20 +78,20 @@ pets
 pets.age
 pets["age"]
 
-pets.head(2) # prints first 2 rows
-pets.tail(1) # prints last row
+pets.head(2)  # prints first 2 rows
+pets.tail(1)  # prints last row
 
-pets.name[1] # 'vesuvius'
-pets.species[0] # 'cat'
-pets["weight"][2] # 34
+pets.name[1]  # 'vesuvius'
+pets.species[0]  # 'cat'
+pets["weight"][2]  # 34
 
 # in R, you would expect to get 3 rows doing this, but here you get 2:
 pets.age[0:2]
 # 0    3
 # 1    6
 
-sum(pets.age)*2 # 28
-max(pets.weight) - min(pets.weight) # 20
+sum(pets.age) * 2  # 28
+max(pets.weight) - min(pets.weight)  # 20
 
 """ If you are doing some serious linear algebra and number-crunching, you may
     just want arrays, not DataFrames. DataFrames are ideal for combining columns
@@ -96,7 +100,8 @@ max(pets.weight) - min(pets.weight) # 20
 
 # 3. Charts ====
 
-import matplotlib as mpl, matplotlib.pyplot as plt
+import matplotlib as mpl
+import matplotlib.pyplot as plt
 %matplotlib inline
 
 # To do data vizualization in Python, use matplotlib
@@ -105,13 +110,17 @@ plt.hist(pets.age);
 
 plt.boxplot(pets.weight);
 
-plt.scatter(pets.age, pets.weight); plt.xlabel("age"); plt.ylabel("weight");
+plt.scatter(pets.age, pets.weight)
+plt.xlabel("age")
+plt.ylabel("weight");
 
 # seaborn sits atop matplotlib and makes plots prettier
 
 import seaborn as sns
 
-plt.scatter(pets.age, pets.weight); plt.xlabel("age"); plt.ylabel("weight");
+plt.scatter(pets.age, pets.weight)
+plt.xlabel("age")
+plt.ylabel("weight");
 
 # there are also some seaborn-specific plotting functions
 # notice how seaborn automatically labels the x-axis on this barplot
@@ -141,7 +150,7 @@ ggplot(aes(x="age",y="weight"), data=pets) + geom_point() + labs(title="pets")
 url = "https://raw.githubusercontent.com/e99n09/R-notes/master/data/hre.csv"
 r = requests.get(url)
 fp = "hre.csv"
-f = open(fp,"wb")
+f = open(fp, "wb")
 f.write(r.text.encode("UTF-8"))
 f.close()
 
@@ -149,33 +158,33 @@ hre = pd.read_csv(fp)
 
 hre.head()
 """
-   Ix      Dynasty        Name        Birth             Death Election 1  
-0 NaN  Carolingian   Charles I  2 April 742    28 January 814        NaN   
-1 NaN  Carolingian     Louis I          778       20 June 840        NaN   
-2 NaN  Carolingian   Lothair I          795  29 September 855        NaN   
-3 NaN  Carolingian    Louis II          825     12 August 875        NaN   
-4 NaN  Carolingian  Charles II  13 June 823     6 October 877        NaN   
-
-  Election 2      Coronation 1   Coronation 2 Ceased to be Emperor  
-0        NaN   25 December 800            NaN       28 January 814   
-1        NaN  11 September 813  5 October 816          20 June 840   
-2        NaN       5 April 823            NaN     29 September 855   
-3        NaN        Easter 850     18 May 872        12 August 875   
-4        NaN   29 December 875            NaN        6 October 877   
-
-  Descent from whom 1 Descent how 1 Descent from whom 2 Descent how 2  
-0                 NaN           NaN                 NaN           NaN  
-1           Charles I           son                 NaN           NaN  
-2             Louis I           son                 NaN           NaN  
-3           Lothair I           son                 NaN           NaN  
-4             Louis I           son                 NaN           NaN  
+   Ix      Dynasty        Name        Birth             Death Election 1
+0 NaN  Carolingian   Charles I  2 April 742    28 January 814        NaN
+1 NaN  Carolingian     Louis I          778       20 June 840        NaN
+2 NaN  Carolingian   Lothair I          795  29 September 855        NaN
+3 NaN  Carolingian    Louis II          825     12 August 875        NaN
+4 NaN  Carolingian  Charles II  13 June 823     6 October 877        NaN
+
+  Election 2      Coronation 1   Coronation 2 Ceased to be Emperor
+0        NaN   25 December 800            NaN       28 January 814
+1        NaN  11 September 813  5 October 816          20 June 840
+2        NaN       5 April 823            NaN     29 September 855
+3        NaN        Easter 850     18 May 872        12 August 875
+4        NaN   29 December 875            NaN        6 October 877
+
+  Descent from whom 1 Descent how 1 Descent from whom 2 Descent how 2
+0                 NaN           NaN                 NaN           NaN
+1           Charles I           son                 NaN           NaN
+2             Louis I           son                 NaN           NaN
+3           Lothair I           son                 NaN           NaN
+4             Louis I           son                 NaN           NaN
 """
 
 # clean the Birth and Death columns
 
-import re # module for regular expressions
+import re  # module for regular expressions
 
-rx = re.compile(r'\d+$') # match trailing digits
+rx = re.compile(r'\d+$')  # match trailing digits
 
 """ This function applies the regular expression to an input column (here Birth,
     Death), flattens the resulting list, converts it to a Series object, and
@@ -185,8 +194,9 @@ rx = re.compile(r'\d+$') # match trailing digits
       - http://stackoverflow.com/questions/11860476/how-to-unlist-a-python-list
       - http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.html
 """
+
 def extractYear(v):
-    return(pd.Series(reduce(lambda x,y: x+y,map(rx.findall,v),[])).astype(int))
+    return pd.Series([y for s in v for y in rx.findall(s)]).astype(int)
 
 hre["BirthY"] = extractYear(hre.Birth)
 hre["DeathY"] = extractYear(hre.Death)
@@ -199,17 +209,17 @@ sns.lmplot("BirthY", "EstAge", data=hre, hue="Dynasty", fit_reg=False);
 
 # use scipy to run a linear regression
 from scipy import stats
-(slope,intercept,rval,pval,stderr)=stats.linregress(hre.BirthY,hre.EstAge)
+(slope, intercept, rval, pval, stderr) = stats.linregress(hre.BirthY, hre.EstAge)
 # code source: http://wiki.scipy.org/Cookbook/LinearRegression
 
 # check the slope
-slope # 0.0057672618839073328
+slope  # 0.0057672618839073328
 
 # check the R^2 value:
-rval**2 # 0.020363950027333586
+rval**2  # 0.020363950027333586
 
 # check the p-value
-pval # 0.34971812581498452
+pval  # 0.34971812581498452
 
 # use seaborn to make a scatterplot and plot the linear regression trend line
 sns.lmplot("BirthY", "EstAge", data=hre);
@@ -223,6 +233,7 @@ sns.lmplot("BirthY", "EstAge", data=hre);
     To see a version of the Holy Roman Emperors analysis using R, see
       - http://github.com/e99n09/R-notes/blob/master/holy_roman_emperors_dates.R
 """
+
 ```
 
 If you want to learn more, get _Python for Data Analysis_ by Wes McKinney. It's a superb resource and I used it as a reference when writing this tutorial.
-- 
cgit v1.2.3


From ecba01b4b1c35cbcf58c5e030b9e79a5c30be55c Mon Sep 17 00:00:00 2001
From: Braxton Fair <braxton@hexel.xyz>
Date: Sun, 28 Feb 2016 20:33:57 -0600
Subject: Write a bit better code

---
 pythonstatcomp.html.markdown | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'pythonstatcomp.html.markdown')

diff --git a/pythonstatcomp.html.markdown b/pythonstatcomp.html.markdown
index f8d83b98..0b02dca8 100644
--- a/pythonstatcomp.html.markdown
+++ b/pythonstatcomp.html.markdown
@@ -150,9 +150,8 @@ ggplot(aes(x="age",y="weight"), data=pets) + geom_point() + labs(title="pets")
 url = "https://raw.githubusercontent.com/e99n09/R-notes/master/data/hre.csv"
 r = requests.get(url)
 fp = "hre.csv"
-f = open(fp, "wb")
-f.write(r.text.encode("UTF-8"))
-f.close()
+with open(fp, "wb") as f:
+    f.write(r.text.encode("UTF-8"))
 
 hre = pd.read_csv(fp)
 
-- 
cgit v1.2.3


From 90f68920b97ec6384933de323e8d7f260d3ae944 Mon Sep 17 00:00:00 2001
From: Jason Kinetec <acidflip@users.noreply.github.com>
Date: Fri, 18 Mar 2016 12:29:29 -0400
Subject: Update pythonstatcomp.html.markdown

---
 pythonstatcomp.html.markdown | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'pythonstatcomp.html.markdown')

diff --git a/pythonstatcomp.html.markdown b/pythonstatcomp.html.markdown
index 0b02dca8..7cf6862e 100644
--- a/pythonstatcomp.html.markdown
+++ b/pythonstatcomp.html.markdown
@@ -1,8 +1,9 @@
 ---
-language: Statistical computing with Python
+category: tool
+tool: Statistical Computing with Python
 contributors:
     - ["e99n09", "https://github.com/e99n09"]
-filename: pythonstatcomp.py
+    - ["acidflip", "https://github.com/acidflip"]
 ---
 
 This is a tutorial on how to do some typical statistical programming tasks using Python. It's intended for people basically familiar with Python and experienced at statistical programming in a language like R, Stata, SAS, SPSS, or MATLAB.
-- 
cgit v1.2.3


From 12f0b9442a680e27ec23504e5f43e13143c121b4 Mon Sep 17 00:00:00 2001
From: Jason Kinetec <acidflip@users.noreply.github.com>
Date: Fri, 18 Mar 2016 14:07:03 -0400
Subject: Update pythonstatcomp.html.markdown

---
 pythonstatcomp.html.markdown | 1 -
 1 file changed, 1 deletion(-)

(limited to 'pythonstatcomp.html.markdown')

diff --git a/pythonstatcomp.html.markdown b/pythonstatcomp.html.markdown
index 7cf6862e..8ee3aa64 100644
--- a/pythonstatcomp.html.markdown
+++ b/pythonstatcomp.html.markdown
@@ -3,7 +3,6 @@ category: tool
 tool: Statistical Computing with Python
 contributors:
     - ["e99n09", "https://github.com/e99n09"]
-    - ["acidflip", "https://github.com/acidflip"]
 ---
 
 This is a tutorial on how to do some typical statistical programming tasks using Python. It's intended for people basically familiar with Python and experienced at statistical programming in a language like R, Stata, SAS, SPSS, or MATLAB.
-- 
cgit v1.2.3


From 985d23a52b76593a120adff5381c2df3a80fe298 Mon Sep 17 00:00:00 2001
From: HairyFotr <hairyfotr@gmail.com>
Date: Wed, 23 Aug 2017 10:14:39 +0200
Subject: Fix a bunch of typos

---
 pythonstatcomp.html.markdown | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'pythonstatcomp.html.markdown')

diff --git a/pythonstatcomp.html.markdown b/pythonstatcomp.html.markdown
index 8ee3aa64..79bbcd8d 100644
--- a/pythonstatcomp.html.markdown
+++ b/pythonstatcomp.html.markdown
@@ -104,7 +104,7 @@ import matplotlib as mpl
 import matplotlib.pyplot as plt
 %matplotlib inline
 
-# To do data vizualization in Python, use matplotlib
+# To do data visualization in Python, use matplotlib
 
 plt.hist(pets.age);
 
-- 
cgit v1.2.3