From 8b3cc63b3e3441b8a8f73a5983f0de0fdd10cf02 Mon Sep 17 00:00:00 2001 From: Adam Date: Mon, 19 Oct 2015 14:33:11 +0800 Subject: Fixed pythonstatcomp doc naming --- pythonstatcomp.html.markdown | 234 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 234 insertions(+) create mode 100644 pythonstatcomp.html.markdown (limited to 'pythonstatcomp.html.markdown') diff --git a/pythonstatcomp.html.markdown b/pythonstatcomp.html.markdown new file mode 100644 index 00000000..78b62e33 --- /dev/null +++ b/pythonstatcomp.html.markdown @@ -0,0 +1,234 @@ +--- +language: Statistical computing with Python +contributors: + - ["e99n09", "https://github.com/e99n09"] +filename: pythonstatcomp.py +--- + +This is a tutorial on how to do some typical statistical programming tasks using Python. It's intended for people basically familiar with Python and experienced at statistical programming in a language like R, Stata, SAS, SPSS, or MATLAB. + +```python + +# 0. Getting set up ==== + +""" Get set up with IPython and pip install the following: numpy, scipy, pandas, + matplotlib, seaborn, requests. + Make sure to do this tutorial in the IPython notebook so that you get + the inline plots and easy documentation lookup. +""" + +# 1. Data acquisition ==== + +""" One reason people choose Python over R is that they intend to interact a lot + with the web, either by scraping pages directly or requesting data through + an API. You can do those things in R, but in the context of a project + already using Python, there's a benefit to sticking with one language. +""" + +import requests # for HTTP requests (web scraping, APIs) +import os + +# web scraping +r = requests.get("https://github.com/adambard/learnxinyminutes-docs") +r.status_code # if 200, request was successful +r.text # raw page source +print(r.text) # prettily formatted +# save the page source in a file: +os.getcwd() # check what's the working directory +f = open("learnxinyminutes.html","wb") +f.write(r.text.encode("UTF-8")) +f.close() + +# downloading a csv +fp = "https://raw.githubusercontent.com/adambard/learnxinyminutes-docs/master/" +fn = "pets.csv" +r = requests.get(fp + fn) +print(r.text) +f = open(fn,"wb") +f.write(r.text.encode("UTF-8")) +f.close() + +""" for more on the requests module, including APIs, see + http://docs.python-requests.org/en/latest/user/quickstart/ +""" + +# 2. Reading a CSV file ==== + +""" Wes McKinney's pandas package gives you 'DataFrame' objects in Python. If + you've used R, you will be familiar with the idea of the "data.frame" already. +""" + +import pandas as pd, numpy as np, scipy as sp +pets = pd.read_csv(fn) +pets +# name age weight species +# 0 fluffy 3 14 cat +# 1 vesuvius 6 23 fish +# 2 rex 5 34 dog + +""" R users: note that Python, like most normal programming languages, starts + indexing from 0. R is the unusual one for starting from 1. +""" + +# two different ways to print out a column +pets.age +pets["age"] + +pets.head(2) # prints first 2 rows +pets.tail(1) # prints last row + +pets.name[1] # 'vesuvius' +pets.species[0] # 'cat' +pets["weight"][2] # 34 + +# in R, you would expect to get 3 rows doing this, but here you get 2: +pets.age[0:2] +# 0 3 +# 1 6 + +sum(pets.age)*2 # 28 +max(pets.weight) - min(pets.weight) # 20 + +""" If you are doing some serious linear algebra and number-crunching, you may + just want arrays, not DataFrames. DataFrames are ideal for combining columns + of different types. +""" + +# 3. Charts ==== + +import matplotlib as mpl, matplotlib.pyplot as plt +%matplotlib inline + +# To do data vizualization in Python, use matplotlib + +plt.hist(pets.age); + +plt.boxplot(pets.weight); + +plt.scatter(pets.age, pets.weight); plt.xlabel("age"); plt.ylabel("weight"); + +# seaborn sits atop matplotlib and makes plots prettier + +import seaborn as sns + +plt.scatter(pets.age, pets.weight); plt.xlabel("age"); plt.ylabel("weight"); + +# there are also some seaborn-specific plotting functions +# notice how seaborn automatically labels the x-axis on this barplot +sns.barplot(pets["age"]) + +# R veterans can still use ggplot +from ggplot import * +ggplot(aes(x="age",y="weight"), data=pets) + geom_point() + labs(title="pets") +# source: https://pypi.python.org/pypi/ggplot + +# there's even a d3.js port: https://github.com/mikedewar/d3py + +# 4. Simple data cleaning and exploratory analysis ==== + +""" Here's a more complicated example that demonstrates a basic data + cleaning workflow leading to the creation of some exploratory plots + and the running of a linear regression. + The data set was transcribed from Wikipedia by hand. It contains + all the Holy Roman Emperors and the important milestones in their lives + (birth, death, coronation, etc.). + The goal of the analysis will be to explore whether a relationship + exists between emperor birth year and emperor lifespan. + data source: https://en.wikipedia.org/wiki/Holy_Roman_Emperor +""" + +# load some data on Holy Roman Emperors +url = "https://raw.githubusercontent.com/e99n09/R-notes/master/data/hre.csv" +r = requests.get(url) +fp = "hre.csv" +f = open(fp,"wb") +f.write(r.text.encode("UTF-8")) +f.close() + +hre = pd.read_csv(fp) + +hre.head() +""" + Ix Dynasty Name Birth Death Election 1 +0 NaN Carolingian Charles I 2 April 742 28 January 814 NaN +1 NaN Carolingian Louis I 778 20 June 840 NaN +2 NaN Carolingian Lothair I 795 29 September 855 NaN +3 NaN Carolingian Louis II 825 12 August 875 NaN +4 NaN Carolingian Charles II 13 June 823 6 October 877 NaN + + Election 2 Coronation 1 Coronation 2 Ceased to be Emperor +0 NaN 25 December 800 NaN 28 January 814 +1 NaN 11 September 813 5 October 816 20 June 840 +2 NaN 5 April 823 NaN 29 September 855 +3 NaN Easter 850 18 May 872 12 August 875 +4 NaN 29 December 875 NaN 6 October 877 + + Descent from whom 1 Descent how 1 Descent from whom 2 Descent how 2 +0 NaN NaN NaN NaN +1 Charles I son NaN NaN +2 Louis I son NaN NaN +3 Lothair I son NaN NaN +4 Louis I son NaN NaN +""" + +# clean the Birth and Death columns + +import re # module for regular expressions + +rx = re.compile(r'\d+$') # match trailing digits + +""" This function applies the regular expression to an input column (here Birth, + Death), flattens the resulting list, converts it to a Series object, and + finally converts the type of the Series object from string to integer. For + more information into what different parts of the code do, see: + - https://docs.python.org/2/howto/regex.html + - http://stackoverflow.com/questions/11860476/how-to-unlist-a-python-list + - http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.html +""" +def extractYear(v): + return(pd.Series(reduce(lambda x,y: x+y,map(rx.findall,v),[])).astype(int)) + +hre["BirthY"] = extractYear(hre.Birth) +hre["DeathY"] = extractYear(hre.Death) + +# make a column telling estimated age +hre["EstAge"] = hre.DeathY.astype(int) - hre.BirthY.astype(int) + +# simple scatterplot, no trend line, color represents dynasty +sns.lmplot("BirthY", "EstAge", data=hre, hue="Dynasty", fit_reg=False); + +# use scipy to run a linear regression +from scipy import stats +(slope,intercept,rval,pval,stderr)=stats.linregress(hre.BirthY,hre.EstAge) +# code source: http://wiki.scipy.org/Cookbook/LinearRegression + +# check the slope +slope # 0.0057672618839073328 + +# check the R^2 value: +rval**2 # 0.020363950027333586 + +# check the p-value +pval # 0.34971812581498452 + +# use seaborn to make a scatterplot and plot the linear regression trend line +sns.lmplot("BirthY", "EstAge", data=hre); + +""" For more information on seaborn, see + - http://web.stanford.edu/~mwaskom/software/seaborn/ + - https://github.com/mwaskom/seaborn + For more information on SciPy, see + - http://wiki.scipy.org/SciPy + - http://wiki.scipy.org/Cookbook/ + To see a version of the Holy Roman Emperors analysis using R, see + - http://github.com/e99n09/R-notes/blob/master/holy_roman_emperors_dates.R +""" +``` + +If you want to learn more, get _Python for Data Analysis_ by Wes McKinney. It's a superb resource and I used it as a reference when writing this tutorial. + +You can also find plenty of interactive IPython tutorials on subjects specific to your interests, like Cam Davidson-Pilon's Probabilistic Programming and Bayesian Methods for Hackers. + +Some more modules to research: + - text analysis and natural language processing: nltk, http://www.nltk.org + - social network analysis: igraph, http://igraph.org/python/ -- cgit v1.2.3 From bde8645cc7bb7f0a88b5d106cd0bd0b7e40886d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Migda=C5=82?= Date: Sun, 3 Jan 2016 19:45:54 +0100 Subject: pep8 fixes (spaces and multiline statements) in Python readability and code style matters --- pythonstatcomp.html.markdown | 103 ++++++++++++++++++++++++------------------- 1 file changed, 57 insertions(+), 46 deletions(-) (limited to 'pythonstatcomp.html.markdown') diff --git a/pythonstatcomp.html.markdown b/pythonstatcomp.html.markdown index 78b62e33..f8d83b98 100644 --- a/pythonstatcomp.html.markdown +++ b/pythonstatcomp.html.markdown @@ -9,6 +9,8 @@ This is a tutorial on how to do some typical statistical programming tasks using ```python + + # 0. Getting set up ==== """ Get set up with IPython and pip install the following: numpy, scipy, pandas, @@ -25,17 +27,17 @@ This is a tutorial on how to do some typical statistical programming tasks using already using Python, there's a benefit to sticking with one language. """ -import requests # for HTTP requests (web scraping, APIs) +import requests # for HTTP requests (web scraping, APIs) import os # web scraping r = requests.get("https://github.com/adambard/learnxinyminutes-docs") -r.status_code # if 200, request was successful -r.text # raw page source -print(r.text) # prettily formatted +r.status_code # if 200, request was successful +r.text # raw page source +print(r.text) # prettily formatted # save the page source in a file: -os.getcwd() # check what's the working directory -f = open("learnxinyminutes.html","wb") +os.getcwd() # check what's the working directory +f = open("learnxinyminutes.html", "wb") f.write(r.text.encode("UTF-8")) f.close() @@ -44,7 +46,7 @@ fp = "https://raw.githubusercontent.com/adambard/learnxinyminutes-docs/master/" fn = "pets.csv" r = requests.get(fp + fn) print(r.text) -f = open(fn,"wb") +f = open(fn, "wb") f.write(r.text.encode("UTF-8")) f.close() @@ -58,7 +60,9 @@ f.close() you've used R, you will be familiar with the idea of the "data.frame" already. """ -import pandas as pd, numpy as np, scipy as sp +import pandas as pd +import numpy as np +import scipy as sp pets = pd.read_csv(fn) pets # name age weight species @@ -74,20 +78,20 @@ pets pets.age pets["age"] -pets.head(2) # prints first 2 rows -pets.tail(1) # prints last row +pets.head(2) # prints first 2 rows +pets.tail(1) # prints last row -pets.name[1] # 'vesuvius' -pets.species[0] # 'cat' -pets["weight"][2] # 34 +pets.name[1] # 'vesuvius' +pets.species[0] # 'cat' +pets["weight"][2] # 34 # in R, you would expect to get 3 rows doing this, but here you get 2: pets.age[0:2] # 0 3 # 1 6 -sum(pets.age)*2 # 28 -max(pets.weight) - min(pets.weight) # 20 +sum(pets.age) * 2 # 28 +max(pets.weight) - min(pets.weight) # 20 """ If you are doing some serious linear algebra and number-crunching, you may just want arrays, not DataFrames. DataFrames are ideal for combining columns @@ -96,7 +100,8 @@ max(pets.weight) - min(pets.weight) # 20 # 3. Charts ==== -import matplotlib as mpl, matplotlib.pyplot as plt +import matplotlib as mpl +import matplotlib.pyplot as plt %matplotlib inline # To do data vizualization in Python, use matplotlib @@ -105,13 +110,17 @@ plt.hist(pets.age); plt.boxplot(pets.weight); -plt.scatter(pets.age, pets.weight); plt.xlabel("age"); plt.ylabel("weight"); +plt.scatter(pets.age, pets.weight) +plt.xlabel("age") +plt.ylabel("weight"); # seaborn sits atop matplotlib and makes plots prettier import seaborn as sns -plt.scatter(pets.age, pets.weight); plt.xlabel("age"); plt.ylabel("weight"); +plt.scatter(pets.age, pets.weight) +plt.xlabel("age") +plt.ylabel("weight"); # there are also some seaborn-specific plotting functions # notice how seaborn automatically labels the x-axis on this barplot @@ -141,7 +150,7 @@ ggplot(aes(x="age",y="weight"), data=pets) + geom_point() + labs(title="pets") url = "https://raw.githubusercontent.com/e99n09/R-notes/master/data/hre.csv" r = requests.get(url) fp = "hre.csv" -f = open(fp,"wb") +f = open(fp, "wb") f.write(r.text.encode("UTF-8")) f.close() @@ -149,33 +158,33 @@ hre = pd.read_csv(fp) hre.head() """ - Ix Dynasty Name Birth Death Election 1 -0 NaN Carolingian Charles I 2 April 742 28 January 814 NaN -1 NaN Carolingian Louis I 778 20 June 840 NaN -2 NaN Carolingian Lothair I 795 29 September 855 NaN -3 NaN Carolingian Louis II 825 12 August 875 NaN -4 NaN Carolingian Charles II 13 June 823 6 October 877 NaN - - Election 2 Coronation 1 Coronation 2 Ceased to be Emperor -0 NaN 25 December 800 NaN 28 January 814 -1 NaN 11 September 813 5 October 816 20 June 840 -2 NaN 5 April 823 NaN 29 September 855 -3 NaN Easter 850 18 May 872 12 August 875 -4 NaN 29 December 875 NaN 6 October 877 - - Descent from whom 1 Descent how 1 Descent from whom 2 Descent how 2 -0 NaN NaN NaN NaN -1 Charles I son NaN NaN -2 Louis I son NaN NaN -3 Lothair I son NaN NaN -4 Louis I son NaN NaN + Ix Dynasty Name Birth Death Election 1 +0 NaN Carolingian Charles I 2 April 742 28 January 814 NaN +1 NaN Carolingian Louis I 778 20 June 840 NaN +2 NaN Carolingian Lothair I 795 29 September 855 NaN +3 NaN Carolingian Louis II 825 12 August 875 NaN +4 NaN Carolingian Charles II 13 June 823 6 October 877 NaN + + Election 2 Coronation 1 Coronation 2 Ceased to be Emperor +0 NaN 25 December 800 NaN 28 January 814 +1 NaN 11 September 813 5 October 816 20 June 840 +2 NaN 5 April 823 NaN 29 September 855 +3 NaN Easter 850 18 May 872 12 August 875 +4 NaN 29 December 875 NaN 6 October 877 + + Descent from whom 1 Descent how 1 Descent from whom 2 Descent how 2 +0 NaN NaN NaN NaN +1 Charles I son NaN NaN +2 Louis I son NaN NaN +3 Lothair I son NaN NaN +4 Louis I son NaN NaN """ # clean the Birth and Death columns -import re # module for regular expressions +import re # module for regular expressions -rx = re.compile(r'\d+$') # match trailing digits +rx = re.compile(r'\d+$') # match trailing digits """ This function applies the regular expression to an input column (here Birth, Death), flattens the resulting list, converts it to a Series object, and @@ -185,8 +194,9 @@ rx = re.compile(r'\d+$') # match trailing digits - http://stackoverflow.com/questions/11860476/how-to-unlist-a-python-list - http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.html """ + def extractYear(v): - return(pd.Series(reduce(lambda x,y: x+y,map(rx.findall,v),[])).astype(int)) + return(pd.Series(reduce(lambda x, y: x + y, map(rx.findall, v), [])).astype(int)) hre["BirthY"] = extractYear(hre.Birth) hre["DeathY"] = extractYear(hre.Death) @@ -199,17 +209,17 @@ sns.lmplot("BirthY", "EstAge", data=hre, hue="Dynasty", fit_reg=False); # use scipy to run a linear regression from scipy import stats -(slope,intercept,rval,pval,stderr)=stats.linregress(hre.BirthY,hre.EstAge) +(slope, intercept, rval, pval, stderr) = stats.linregress(hre.BirthY, hre.EstAge) # code source: http://wiki.scipy.org/Cookbook/LinearRegression # check the slope -slope # 0.0057672618839073328 +slope # 0.0057672618839073328 # check the R^2 value: -rval**2 # 0.020363950027333586 +rval**2 # 0.020363950027333586 # check the p-value -pval # 0.34971812581498452 +pval # 0.34971812581498452 # use seaborn to make a scatterplot and plot the linear regression trend line sns.lmplot("BirthY", "EstAge", data=hre); @@ -223,6 +233,7 @@ sns.lmplot("BirthY", "EstAge", data=hre); To see a version of the Holy Roman Emperors analysis using R, see - http://github.com/e99n09/R-notes/blob/master/holy_roman_emperors_dates.R """ + ``` If you want to learn more, get _Python for Data Analysis_ by Wes McKinney. It's a superb resource and I used it as a reference when writing this tutorial. -- cgit v1.2.3 From ecba01b4b1c35cbcf58c5e030b9e79a5c30be55c Mon Sep 17 00:00:00 2001 From: Braxton Fair Date: Sun, 28 Feb 2016 20:33:57 -0600 Subject: Write a bit better code --- pythonstatcomp.html.markdown | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'pythonstatcomp.html.markdown') diff --git a/pythonstatcomp.html.markdown b/pythonstatcomp.html.markdown index f8d83b98..0b02dca8 100644 --- a/pythonstatcomp.html.markdown +++ b/pythonstatcomp.html.markdown @@ -150,9 +150,8 @@ ggplot(aes(x="age",y="weight"), data=pets) + geom_point() + labs(title="pets") url = "https://raw.githubusercontent.com/e99n09/R-notes/master/data/hre.csv" r = requests.get(url) fp = "hre.csv" -f = open(fp, "wb") -f.write(r.text.encode("UTF-8")) -f.close() +with open(fp, "wb") as f: + f.write(r.text.encode("UTF-8")) hre = pd.read_csv(fp) -- cgit v1.2.3 From 90f68920b97ec6384933de323e8d7f260d3ae944 Mon Sep 17 00:00:00 2001 From: Jason Kinetec Date: Fri, 18 Mar 2016 12:29:29 -0400 Subject: Update pythonstatcomp.html.markdown --- pythonstatcomp.html.markdown | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'pythonstatcomp.html.markdown') diff --git a/pythonstatcomp.html.markdown b/pythonstatcomp.html.markdown index 0b02dca8..7cf6862e 100644 --- a/pythonstatcomp.html.markdown +++ b/pythonstatcomp.html.markdown @@ -1,8 +1,9 @@ --- -language: Statistical computing with Python +category: tool +tool: Statistical Computing with Python contributors: - ["e99n09", "https://github.com/e99n09"] -filename: pythonstatcomp.py + - ["acidflip", "https://github.com/acidflip"] --- This is a tutorial on how to do some typical statistical programming tasks using Python. It's intended for people basically familiar with Python and experienced at statistical programming in a language like R, Stata, SAS, SPSS, or MATLAB. -- cgit v1.2.3 From 12f0b9442a680e27ec23504e5f43e13143c121b4 Mon Sep 17 00:00:00 2001 From: Jason Kinetec Date: Fri, 18 Mar 2016 14:07:03 -0400 Subject: Update pythonstatcomp.html.markdown --- pythonstatcomp.html.markdown | 1 - 1 file changed, 1 deletion(-) (limited to 'pythonstatcomp.html.markdown') diff --git a/pythonstatcomp.html.markdown b/pythonstatcomp.html.markdown index 7cf6862e..8ee3aa64 100644 --- a/pythonstatcomp.html.markdown +++ b/pythonstatcomp.html.markdown @@ -3,7 +3,6 @@ category: tool tool: Statistical Computing with Python contributors: - ["e99n09", "https://github.com/e99n09"] - - ["acidflip", "https://github.com/acidflip"] --- This is a tutorial on how to do some typical statistical programming tasks using Python. It's intended for people basically familiar with Python and experienced at statistical programming in a language like R, Stata, SAS, SPSS, or MATLAB. -- cgit v1.2.3 From 985d23a52b76593a120adff5381c2df3a80fe298 Mon Sep 17 00:00:00 2001 From: HairyFotr Date: Wed, 23 Aug 2017 10:14:39 +0200 Subject: Fix a bunch of typos --- pythonstatcomp.html.markdown | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pythonstatcomp.html.markdown') diff --git a/pythonstatcomp.html.markdown b/pythonstatcomp.html.markdown index 8ee3aa64..79bbcd8d 100644 --- a/pythonstatcomp.html.markdown +++ b/pythonstatcomp.html.markdown @@ -104,7 +104,7 @@ import matplotlib as mpl import matplotlib.pyplot as plt %matplotlib inline -# To do data vizualization in Python, use matplotlib +# To do data visualization in Python, use matplotlib plt.hist(pets.age); -- cgit v1.2.3