From c370888b800e2912ac6e0a0120553e2fe2c90e65 Mon Sep 17 00:00:00 2001 From: Rhiannon Walker Date: Wed, 1 Feb 2017 14:50:39 -0600 Subject: [PATCH] added 5min append State exporing the usaddress package --- .../5min append State-checkpoint.ipynb | 999 ++++++++++++++++++ 5min append State.ipynb | 999 ++++++++++++++++++ 2 files changed, 1998 insertions(+) create mode 100644 .ipynb_checkpoints/5min append State-checkpoint.ipynb create mode 100644 5min append State.ipynb diff --git a/.ipynb_checkpoints/5min append State-checkpoint.ipynb b/.ipynb_checkpoints/5min append State-checkpoint.ipynb new file mode 100644 index 0000000..d6d4d03 --- /dev/null +++ b/.ipynb_checkpoints/5min append State-checkpoint.ipynb @@ -0,0 +1,999 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 5 minute append to state column \n", + "\n", + "After taking some time to look around I found usaddress for parsing states. This notebook shows how easy it is to parse addresses with usaddress. I feel that the list of tuples is a bit clunky, but it works and I only spotted one error in the results... Indianna?? really??" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import seaborn as sns\n", + "import usaddress\n", + "% matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "df = pd.read_csv('https://query.data.world/s/78ou6jcu4jfseul53lu1w3nio')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
locationcrowd-low-estimatecrowd-high-estimatemean-high-lowsourceLatitudeLongitude
0Abilene, TX200200200.0http://www.reporternews.com/story/news/local/2...32.576489-99.665323
1Accident, MD545454.0Twitter; on-site witness39.628700-79.319760
2Adak, AK101010.0adn.com0.0000000.000000
3Adrian, MI130150140.0https://www.facebook.com/events/847360115406578/41.889943-84.065892
4Ajo, AZ250250250.0https://www.facebook.com/plugins/post.php?href...32.384890-112.890110
\n", + "
" + ], + "text/plain": [ + " location crowd-low-estimate crowd-high-estimate mean-high-low \\\n", + "0 Abilene, TX 200 200 200.0 \n", + "1 Accident, MD 54 54 54.0 \n", + "2 Adak, AK 10 10 10.0 \n", + "3 Adrian, MI 130 150 140.0 \n", + "4 Ajo, AZ 250 250 250.0 \n", + "\n", + " source Latitude Longitude \n", + "0 http://www.reporternews.com/story/news/local/2... 32.576489 -99.665323 \n", + "1 Twitter; on-site witness 39.628700 -79.319760 \n", + "2 adn.com 0.000000 0.000000 \n", + "3 https://www.facebook.com/events/847360115406578/ 41.889943 -84.065892 \n", + "4 https://www.facebook.com/plugins/post.php?href... 32.384890 -112.890110 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## original method\n", + "using last 2 characters of location" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df['State'] = df['location'].str[-2:]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
location
State
CA69
WA27
NY25
OR24
AK22
\n", + "
" + ], + "text/plain": [ + " location\n", + "State \n", + "CA 69\n", + "WA 27\n", + "NY 25\n", + "OR 24\n", + "AK 22" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.groupby('State').count().sort_values('location', ascending=False)[['location']].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looks like there are a few errors in State that would need to be cleansed (at least 12)\n", + "\n", + "15 minutes is not enough to fix this by hand. I know there is a package that would do this faster; it has been mentioned on Talk Python To Me. I'll have to look into this package another day." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "62" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df.groupby('State').count())" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
location
State
DE2
OK2
LA2
DC1
RI1
ah1
er1
es1
ge1
le1
na1
nd1
on1
t)1
te1
\n", + "
" + ], + "text/plain": [ + " location\n", + "State \n", + "DE 2\n", + "OK 2\n", + "LA 2\n", + "DC 1\n", + "RI 1\n", + "ah 1\n", + "er 1\n", + "es 1\n", + "ge 1\n", + "le 1\n", + "na 1\n", + "nd 1\n", + "on 1\n", + "t) 1\n", + "te 1" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.groupby('State').count().sort_values('location', ascending=False)[['location']].tail(15)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Usaddress Method" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df['Address'] = df['location'].apply(usaddress.parse)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "10 [(Albuquerque,, PlaceName), (NM, StateName)]\n", + "11 [(Almanor, PlaceName), (West,, PlaceName), (CA...\n", + "12 [(Alpine,, PlaceName), (TX, StateName)]\n", + "13 [(Amarillo,, PlaceName), (TX, StateName)]\n", + "14 [(Amelia, StreetName), (Island,, StreetNamePos...\n", + "15 [(Anacortes,, PlaceName), (WA, StateName)]\n", + "16 [(Anchorage,, PlaceName), (AK, StateName)]\n", + "17 [(Ann, PlaceName), (Arbor,, PlaceName), (MI, S...\n", + "18 [(Annapolis,, PlaceName), (MD, StateName)]\n", + "19 [(Arlington,, PlaceName), (VA, StateName)]\n", + "Name: Address, dtype: object" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['Address'].iloc[10:20]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012345
10(Albuquerque,, PlaceName)(NM, StateName)NoneNoneNoneNone
11(Almanor, PlaceName)(West,, PlaceName)(CA, StateName)NoneNoneNone
12(Alpine,, PlaceName)(TX, StateName)NoneNoneNoneNone
13(Amarillo,, PlaceName)(TX, StateName)NoneNoneNoneNone
14(Amelia, StreetName)(Island,, StreetNamePostType)(FL, OccupancyType)NoneNoneNone
15(Anacortes,, PlaceName)(WA, StateName)NoneNoneNoneNone
16(Anchorage,, PlaceName)(AK, StateName)NoneNoneNoneNone
17(Ann, PlaceName)(Arbor,, PlaceName)(MI, StateName)NoneNoneNone
18(Annapolis,, PlaceName)(MD, StateName)NoneNoneNoneNone
19(Arlington,, PlaceName)(VA, StateName)NoneNoneNoneNone
\n", + "
" + ], + "text/plain": [ + " 0 1 \\\n", + "10 (Albuquerque,, PlaceName) (NM, StateName) \n", + "11 (Almanor, PlaceName) (West,, PlaceName) \n", + "12 (Alpine,, PlaceName) (TX, StateName) \n", + "13 (Amarillo,, PlaceName) (TX, StateName) \n", + "14 (Amelia, StreetName) (Island,, StreetNamePostType) \n", + "15 (Anacortes,, PlaceName) (WA, StateName) \n", + "16 (Anchorage,, PlaceName) (AK, StateName) \n", + "17 (Ann, PlaceName) (Arbor,, PlaceName) \n", + "18 (Annapolis,, PlaceName) (MD, StateName) \n", + "19 (Arlington,, PlaceName) (VA, StateName) \n", + "\n", + " 2 3 4 5 \n", + "10 None None None None \n", + "11 (CA, StateName) None None None \n", + "12 None None None None \n", + "13 None None None None \n", + "14 (FL, OccupancyType) None None None \n", + "15 None None None None \n", + "16 None None None None \n", + "17 (MI, StateName) None None None \n", + "18 None None None None \n", + "19 None None None None " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame.from_records(df['Address'].values.tolist()).iloc[10:20]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def get_state(lst):\n", + " for tpl in lst:\n", + " if tpl[1] == 'StateName':\n", + " return tpl[0]\n", + " return None" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df['State'] = df['Address'].apply(get_state)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "53" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df.groupby('State').count())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Errors? 
53?\n", + "Looks like there were 53 states: it counted PR (Puerto Rico) and DC, which is good. But it also included Indiana spelled out in full (separately from IN); not sure why, but I am impressed with the results." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + "
location
State
CA70
WA27
NY25
OR24
AK22
MI20
TX20
PA19
CO19
WI16
NC15
FL14
VA13
NM11
OH11
AZ11
ME11
NJ10
MN10
MA10
IL9
UT8
IN8
ID8
MD7
TN7
NH7
IA6
GA6
HI5
CT5
SD4
SC4
PR4
MS4
MO4
AR4
NV3
VT3
NE3
KY3
MT3
ND3
WY3
AL2
LA2
KS2
DE2
OK2
WV2
RI1
Indiana1
DC1
\n", + "
" + ], + "text/plain": [ + " location\n", + "State \n", + "CA 70\n", + "WA 27\n", + "NY 25\n", + "OR 24\n", + "AK 22\n", + "MI 20\n", + "TX 20\n", + "PA 19\n", + "CO 19\n", + "WI 16\n", + "NC 15\n", + "FL 14\n", + "VA 13\n", + "NM 11\n", + "OH 11\n", + "AZ 11\n", + "ME 11\n", + "NJ 10\n", + "MN 10\n", + "MA 10\n", + "IL 9\n", + "UT 8\n", + "IN 8\n", + "ID 8\n", + "MD 7\n", + "TN 7\n", + "NH 7\n", + "IA 6\n", + "GA 6\n", + "HI 5\n", + "CT 5\n", + "SD 4\n", + "SC 4\n", + "PR 4\n", + "MS 4\n", + "MO 4\n", + "AR 4\n", + "NV 3\n", + "VT 3\n", + "NE 3\n", + "KY 3\n", + "MT 3\n", + "ND 3\n", + "WY 3\n", + "AL 2\n", + "LA 2\n", + "KS 2\n", + "DE 2\n", + "OK 2\n", + "WV 2\n", + "RI 1\n", + "Indiana 1\n", + "DC 1" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.groupby('State').count().sort_values('location', ascending=False)[['location']]" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('Amarillo,', 'PlaceName'), ('TX', 'StateName')]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['Address'].values.tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [conda root]", + "language": "python", + "name": "conda-root-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.4.5" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/5min append State.ipynb b/5min append State.ipynb new file mode 100644 index 0000000..d6d4d03 --- /dev/null +++ b/5min append 
State.ipynb @@ -0,0 +1,999 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 5 minute append to state column \n", + "\n", + "After taking some time to look around I found usaddress for parsing states. This notebook shows how easy it is to parse addresses with usaddress. I feel that the list of tuples is a bit clunky, but it works and I only spotted one error in the results... Indianna?? really??" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import seaborn as sns\n", + "import usaddress\n", + "% matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "df = pd.read_csv('https://query.data.world/s/78ou6jcu4jfseul53lu1w3nio')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
locationcrowd-low-estimatecrowd-high-estimatemean-high-lowsourceLatitudeLongitude
0Abilene, TX200200200.0http://www.reporternews.com/story/news/local/2...32.576489-99.665323
1Accident, MD545454.0Twitter; on-site witness39.628700-79.319760
2Adak, AK101010.0adn.com0.0000000.000000
3Adrian, MI130150140.0https://www.facebook.com/events/847360115406578/41.889943-84.065892
4Ajo, AZ250250250.0https://www.facebook.com/plugins/post.php?href...32.384890-112.890110
\n", + "
" + ], + "text/plain": [ + " location crowd-low-estimate crowd-high-estimate mean-high-low \\\n", + "0 Abilene, TX 200 200 200.0 \n", + "1 Accident, MD 54 54 54.0 \n", + "2 Adak, AK 10 10 10.0 \n", + "3 Adrian, MI 130 150 140.0 \n", + "4 Ajo, AZ 250 250 250.0 \n", + "\n", + " source Latitude Longitude \n", + "0 http://www.reporternews.com/story/news/local/2... 32.576489 -99.665323 \n", + "1 Twitter; on-site witness 39.628700 -79.319760 \n", + "2 adn.com 0.000000 0.000000 \n", + "3 https://www.facebook.com/events/847360115406578/ 41.889943 -84.065892 \n", + "4 https://www.facebook.com/plugins/post.php?href... 32.384890 -112.890110 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## original method\n", + "using last 2 characters of location" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df['State'] = df['location'].str[-2:]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
location
State
CA69
WA27
NY25
OR24
AK22
\n", + "
" + ], + "text/plain": [ + " location\n", + "State \n", + "CA 69\n", + "WA 27\n", + "NY 25\n", + "OR 24\n", + "AK 22" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.groupby('State').count().sort_values('location', ascending=False)[['location']].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looks like there are a few errors in State that would need to be cleansed (at least 12)\n", + "\n", + "15 minutes is not enough to fix this by hand. I know there is a package that would do this faster; it has been mentioned on Talk Python To Me. I'll have to look into this package another day." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "62" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df.groupby('State').count())" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
location
State
DE2
OK2
LA2
DC1
RI1
ah1
er1
es1
ge1
le1
na1
nd1
on1
t)1
te1
\n", + "
" + ], + "text/plain": [ + " location\n", + "State \n", + "DE 2\n", + "OK 2\n", + "LA 2\n", + "DC 1\n", + "RI 1\n", + "ah 1\n", + "er 1\n", + "es 1\n", + "ge 1\n", + "le 1\n", + "na 1\n", + "nd 1\n", + "on 1\n", + "t) 1\n", + "te 1" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.groupby('State').count().sort_values('location', ascending=False)[['location']].tail(15)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Usaddress Method" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df['Address'] = df['location'].apply(usaddress.parse)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "10 [(Albuquerque,, PlaceName), (NM, StateName)]\n", + "11 [(Almanor, PlaceName), (West,, PlaceName), (CA...\n", + "12 [(Alpine,, PlaceName), (TX, StateName)]\n", + "13 [(Amarillo,, PlaceName), (TX, StateName)]\n", + "14 [(Amelia, StreetName), (Island,, StreetNamePos...\n", + "15 [(Anacortes,, PlaceName), (WA, StateName)]\n", + "16 [(Anchorage,, PlaceName), (AK, StateName)]\n", + "17 [(Ann, PlaceName), (Arbor,, PlaceName), (MI, S...\n", + "18 [(Annapolis,, PlaceName), (MD, StateName)]\n", + "19 [(Arlington,, PlaceName), (VA, StateName)]\n", + "Name: Address, dtype: object" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['Address'].iloc[10:20]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012345
10(Albuquerque,, PlaceName)(NM, StateName)NoneNoneNoneNone
11(Almanor, PlaceName)(West,, PlaceName)(CA, StateName)NoneNoneNone
12(Alpine,, PlaceName)(TX, StateName)NoneNoneNoneNone
13(Amarillo,, PlaceName)(TX, StateName)NoneNoneNoneNone
14(Amelia, StreetName)(Island,, StreetNamePostType)(FL, OccupancyType)NoneNoneNone
15(Anacortes,, PlaceName)(WA, StateName)NoneNoneNoneNone
16(Anchorage,, PlaceName)(AK, StateName)NoneNoneNoneNone
17(Ann, PlaceName)(Arbor,, PlaceName)(MI, StateName)NoneNoneNone
18(Annapolis,, PlaceName)(MD, StateName)NoneNoneNoneNone
19(Arlington,, PlaceName)(VA, StateName)NoneNoneNoneNone
\n", + "
" + ], + "text/plain": [ + " 0 1 \\\n", + "10 (Albuquerque,, PlaceName) (NM, StateName) \n", + "11 (Almanor, PlaceName) (West,, PlaceName) \n", + "12 (Alpine,, PlaceName) (TX, StateName) \n", + "13 (Amarillo,, PlaceName) (TX, StateName) \n", + "14 (Amelia, StreetName) (Island,, StreetNamePostType) \n", + "15 (Anacortes,, PlaceName) (WA, StateName) \n", + "16 (Anchorage,, PlaceName) (AK, StateName) \n", + "17 (Ann, PlaceName) (Arbor,, PlaceName) \n", + "18 (Annapolis,, PlaceName) (MD, StateName) \n", + "19 (Arlington,, PlaceName) (VA, StateName) \n", + "\n", + " 2 3 4 5 \n", + "10 None None None None \n", + "11 (CA, StateName) None None None \n", + "12 None None None None \n", + "13 None None None None \n", + "14 (FL, OccupancyType) None None None \n", + "15 None None None None \n", + "16 None None None None \n", + "17 (MI, StateName) None None None \n", + "18 None None None None \n", + "19 None None None None " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame.from_records(df['Address'].values.tolist()).iloc[10:20]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def get_state(lst):\n", + " for tpl in lst:\n", + " if tpl[1] == 'StateName':\n", + " return tpl[0]\n", + " return None" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df['State'] = df['Address'].apply(get_state)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "53" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df.groupby('State').count())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Errors? 
53?\n", + "Looks like there were 53 states: it counted PR (Puerto Rico) and DC, which is good. But it also included Indiana spelled out in full (separately from IN); not sure why, but I am impressed with the results." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + "
location
State
CA70
WA27
NY25
OR24
AK22
MI20
TX20
PA19
CO19
WI16
NC15
FL14
VA13
NM11
OH11
AZ11
ME11
NJ10
MN10
MA10
IL9
UT8
IN8
ID8
MD7
TN7
NH7
IA6
GA6
HI5
CT5
SD4
SC4
PR4
MS4
MO4
AR4
NV3
VT3
NE3
KY3
MT3
ND3
WY3
AL2
LA2
KS2
DE2
OK2
WV2
RI1
Indiana1
DC1
\n", + "
" + ], + "text/plain": [ + " location\n", + "State \n", + "CA 70\n", + "WA 27\n", + "NY 25\n", + "OR 24\n", + "AK 22\n", + "MI 20\n", + "TX 20\n", + "PA 19\n", + "CO 19\n", + "WI 16\n", + "NC 15\n", + "FL 14\n", + "VA 13\n", + "NM 11\n", + "OH 11\n", + "AZ 11\n", + "ME 11\n", + "NJ 10\n", + "MN 10\n", + "MA 10\n", + "IL 9\n", + "UT 8\n", + "IN 8\n", + "ID 8\n", + "MD 7\n", + "TN 7\n", + "NH 7\n", + "IA 6\n", + "GA 6\n", + "HI 5\n", + "CT 5\n", + "SD 4\n", + "SC 4\n", + "PR 4\n", + "MS 4\n", + "MO 4\n", + "AR 4\n", + "NV 3\n", + "VT 3\n", + "NE 3\n", + "KY 3\n", + "MT 3\n", + "ND 3\n", + "WY 3\n", + "AL 2\n", + "LA 2\n", + "KS 2\n", + "DE 2\n", + "OK 2\n", + "WV 2\n", + "RI 1\n", + "Indiana 1\n", + "DC 1" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.groupby('State').count().sort_values('location', ascending=False)[['location']]" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('Amarillo,', 'PlaceName'), ('TX', 'StateName')]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['Address'].values.tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [conda root]", + "language": "python", + "name": "conda-root-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.4.5" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}