added 5min append State exploring the usaddress package

2017-02-01 14:50:39 -06:00 · 2017-02-01 14:50:39 -06:00 · c370888b80
commit c370888b80
parent 680e352bcd
2 changed files with 1998 additions and 0 deletions
--- a/.ipynb_checkpoints/5min
+++ b/.ipynb_checkpoints/5min
@ -0,0 +1,999 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 5 minute append to state column \n",
+    "\n",
+    "After taking some time to look around I found usaddress for parsing states.  This notebook shows how easy it is to parse addresses with usaddress.  I feel that the list of tuples is a bit clunky, but it works and I only spotted one error in the results... Indiana (spelled out instead of IN)??  really??"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import seaborn as sns\n",
+    "import usaddress\n",
+    "% matplotlib inline"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv('https://query.data.world/s/78ou6jcu4jfseul53lu1w3nio')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>location</th>\n",
+       "      <th>crowd-low-estimate</th>\n",
+       "      <th>crowd-high-estimate</th>\n",
+       "      <th>mean-high-low</th>\n",
+       "      <th>source</th>\n",
+       "      <th>Latitude</th>\n",
+       "      <th>Longitude</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Abilene, TX</td>\n",
+       "      <td>200</td>\n",
+       "      <td>200</td>\n",
+       "      <td>200.0</td>\n",
+       "      <td>http://www.reporternews.com/story/news/local/2...</td>\n",
+       "      <td>32.576489</td>\n",
+       "      <td>-99.665323</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Accident, MD</td>\n",
+       "      <td>54</td>\n",
+       "      <td>54</td>\n",
+       "      <td>54.0</td>\n",
+       "      <td>Twitter; on-site witness</td>\n",
+       "      <td>39.628700</td>\n",
+       "      <td>-79.319760</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Adak, AK</td>\n",
+       "      <td>10</td>\n",
+       "      <td>10</td>\n",
+       "      <td>10.0</td>\n",
+       "      <td>adn.com</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Adrian, MI</td>\n",
+       "      <td>130</td>\n",
+       "      <td>150</td>\n",
+       "      <td>140.0</td>\n",
+       "      <td>https://www.facebook.com/events/847360115406578/</td>\n",
+       "      <td>41.889943</td>\n",
+       "      <td>-84.065892</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Ajo, AZ</td>\n",
+       "      <td>250</td>\n",
+       "      <td>250</td>\n",
+       "      <td>250.0</td>\n",
+       "      <td>https://www.facebook.com/plugins/post.php?href...</td>\n",
+       "      <td>32.384890</td>\n",
+       "      <td>-112.890110</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       location crowd-low-estimate crowd-high-estimate  mean-high-low  \\\n",
+       "0   Abilene, TX                200                 200          200.0   \n",
+       "1  Accident, MD                 54                  54           54.0   \n",
+       "2      Adak, AK                 10                  10           10.0   \n",
+       "3    Adrian, MI                130                 150          140.0   \n",
+       "4       Ajo, AZ                250                 250          250.0   \n",
+       "\n",
+       "                                              source   Latitude   Longitude  \n",
+       "0  http://www.reporternews.com/story/news/local/2...  32.576489  -99.665323  \n",
+       "1                          Twitter; on-site witness   39.628700  -79.319760  \n",
+       "2                                            adn.com   0.000000    0.000000  \n",
+       "3   https://www.facebook.com/events/847360115406578/  41.889943  -84.065892  \n",
+       "4  https://www.facebook.com/plugins/post.php?href...  32.384890 -112.890110  "
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Original Method\n",
+    "Using the last 2 characters of location"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "df['State'] = df['location'].str[-2:]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>location</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>State</th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>CA</th>\n",
+       "      <td>69</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>WA</th>\n",
+       "      <td>27</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>NY</th>\n",
+       "      <td>25</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>OR</th>\n",
+       "      <td>24</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>AK</th>\n",
+       "      <td>22</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       location\n",
+       "State          \n",
+       "CA           69\n",
+       "WA           27\n",
+       "NY           25\n",
+       "OR           24\n",
+       "AK           22"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.groupby('State').count().sort_values('location', ascending=False)[['location']].head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Looks like there are a few errors in State that would need to be cleansed (at least 12).\n",
+    "\n",
+    "15 minutes is not enough to fix this by hand. I know there is a package, mentioned on Talk Python To Me, that would do this faster. I'll have to look into this package another day."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "62"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(df.groupby('State').count())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>location</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>State</th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>DE</th>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>OK</th>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>LA</th>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>DC</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>RI</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ah</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>er</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>es</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ge</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>le</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>na</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>nd</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>on</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>t)</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>te</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       location\n",
+       "State          \n",
+       "DE            2\n",
+       "OK            2\n",
+       "LA            2\n",
+       "DC            1\n",
+       "RI            1\n",
+       "ah            1\n",
+       "er            1\n",
+       "es            1\n",
+       "ge            1\n",
+       "le            1\n",
+       "na            1\n",
+       "nd            1\n",
+       "on            1\n",
+       "t)            1\n",
+       "te            1"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.groupby('State').count().sort_values('location', ascending=False)[['location']].tail(15)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Usaddress Method"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "df['Address'] = df['location'].apply(usaddress.parse)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "10         [(Albuquerque,, PlaceName), (NM, StateName)]\n",
+       "11    [(Almanor, PlaceName), (West,, PlaceName), (CA...\n",
+       "12              [(Alpine,, PlaceName), (TX, StateName)]\n",
+       "13            [(Amarillo,, PlaceName), (TX, StateName)]\n",
+       "14    [(Amelia, StreetName), (Island,, StreetNamePos...\n",
+       "15           [(Anacortes,, PlaceName), (WA, StateName)]\n",
+       "16           [(Anchorage,, PlaceName), (AK, StateName)]\n",
+       "17    [(Ann, PlaceName), (Arbor,, PlaceName), (MI, S...\n",
+       "18           [(Annapolis,, PlaceName), (MD, StateName)]\n",
+       "19           [(Arlington,, PlaceName), (VA, StateName)]\n",
+       "Name: Address, dtype: object"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df['Address'].iloc[10:20]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>0</th>\n",
+       "      <th>1</th>\n",
+       "      <th>2</th>\n",
+       "      <th>3</th>\n",
+       "      <th>4</th>\n",
+       "      <th>5</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>(Albuquerque,, PlaceName)</td>\n",
+       "      <td>(NM, StateName)</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>(Almanor, PlaceName)</td>\n",
+       "      <td>(West,, PlaceName)</td>\n",
+       "      <td>(CA, StateName)</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>(Alpine,, PlaceName)</td>\n",
+       "      <td>(TX, StateName)</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>(Amarillo,, PlaceName)</td>\n",
+       "      <td>(TX, StateName)</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14</th>\n",
+       "      <td>(Amelia, StreetName)</td>\n",
+       "      <td>(Island,, StreetNamePostType)</td>\n",
+       "      <td>(FL, OccupancyType)</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15</th>\n",
+       "      <td>(Anacortes,, PlaceName)</td>\n",
+       "      <td>(WA, StateName)</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>16</th>\n",
+       "      <td>(Anchorage,, PlaceName)</td>\n",
+       "      <td>(AK, StateName)</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17</th>\n",
+       "      <td>(Ann, PlaceName)</td>\n",
+       "      <td>(Arbor,, PlaceName)</td>\n",
+       "      <td>(MI, StateName)</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18</th>\n",
+       "      <td>(Annapolis,, PlaceName)</td>\n",
+       "      <td>(MD, StateName)</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>19</th>\n",
+       "      <td>(Arlington,, PlaceName)</td>\n",
+       "      <td>(VA, StateName)</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                            0                              1  \\\n",
+       "10  (Albuquerque,, PlaceName)                (NM, StateName)   \n",
+       "11       (Almanor, PlaceName)             (West,, PlaceName)   \n",
+       "12       (Alpine,, PlaceName)                (TX, StateName)   \n",
+       "13     (Amarillo,, PlaceName)                (TX, StateName)   \n",
+       "14       (Amelia, StreetName)  (Island,, StreetNamePostType)   \n",
+       "15    (Anacortes,, PlaceName)                (WA, StateName)   \n",
+       "16    (Anchorage,, PlaceName)                (AK, StateName)   \n",
+       "17           (Ann, PlaceName)            (Arbor,, PlaceName)   \n",
+       "18    (Annapolis,, PlaceName)                (MD, StateName)   \n",
+       "19    (Arlington,, PlaceName)                (VA, StateName)   \n",
+       "\n",
+       "                      2     3     4     5  \n",
+       "10                 None  None  None  None  \n",
+       "11      (CA, StateName)  None  None  None  \n",
+       "12                 None  None  None  None  \n",
+       "13                 None  None  None  None  \n",
+       "14  (FL, OccupancyType)  None  None  None  \n",
+       "15                 None  None  None  None  \n",
+       "16                 None  None  None  None  \n",
+       "17      (MI, StateName)  None  None  None  \n",
+       "18                 None  None  None  None  \n",
+       "19                 None  None  None  None  "
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pd.DataFrame.from_records(df['Address'].values.tolist()).iloc[10:20]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "def get_state(lst):\n",
+    "    for tpl in lst:\n",
+    "        if tpl[1] == 'StateName':\n",
+    "            return tpl[0]\n",
+    "    return None"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "df['State'] = df['Address'].apply(get_state)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "53"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(df.groupby('State').count())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Errors? 53?\n",
+    "Looks like there were 53 states: it counted PR (Puerto Rico) and DC, which is good. But it also included Indiana (spelled out in full, in addition to IN)?? Not sure why, but I am impressed with the results."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>location</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>State</th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>CA</th>\n",
+       "      <td>70</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>WA</th>\n",
+       "      <td>27</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>NY</th>\n",
+       "      <td>25</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>OR</th>\n",
+       "      <td>24</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>AK</th>\n",
+       "      <td>22</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>MI</th>\n",
+       "      <td>20</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>TX</th>\n",
+       "      <td>20</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>PA</th>\n",
+       "      <td>19</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>CO</th>\n",
+       "      <td>19</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>WI</th>\n",
+       "      <td>16</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>NC</th>\n",
+       "      <td>15</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>FL</th>\n",
+       "      <td>14</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>VA</th>\n",
+       "      <td>13</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>NM</th>\n",
+       "      <td>11</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>OH</th>\n",
+       "      <td>11</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>AZ</th>\n",
+       "      <td>11</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ME</th>\n",
+       "      <td>11</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>NJ</th>\n",
+       "      <td>10</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>MN</th>\n",
+       "      <td>10</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>MA</th>\n",
+       "      <td>10</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>IL</th>\n",
+       "      <td>9</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>UT</th>\n",
+       "      <td>8</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>IN</th>\n",
+       "      <td>8</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ID</th>\n",
+       "      <td>8</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>MD</th>\n",
+       "      <td>7</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>TN</th>\n",
+       "      <td>7</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>NH</th>\n",
+       "      <td>7</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>IA</th>\n",
+       "      <td>6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>GA</th>\n",
+       "      <td>6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>HI</th>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>CT</th>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>SD</th>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>SC</th>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>PR</th>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>MS</th>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>MO</th>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>AR</th>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>NV</th>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>VT</th>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>NE</th>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>KY</th>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>MT</th>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ND</th>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>WY</th>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>AL</th>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>LA</th>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>KS</th>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>DE</th>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>OK</th>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>WV</th>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>RI</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Indiana</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>DC</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         location\n",
+       "State            \n",
+       "CA             70\n",
+       "WA             27\n",
+       "NY             25\n",
+       "OR             24\n",
+       "AK             22\n",
+       "MI             20\n",
+       "TX             20\n",
+       "PA             19\n",
+       "CO             19\n",
+       "WI             16\n",
+       "NC             15\n",
+       "FL             14\n",
+       "VA             13\n",
+       "NM             11\n",
+       "OH             11\n",
+       "AZ             11\n",
+       "ME             11\n",
+       "NJ             10\n",
+       "MN             10\n",
+       "MA             10\n",
+       "IL              9\n",
+       "UT              8\n",
+       "IN              8\n",
+       "ID              8\n",
+       "MD              7\n",
+       "TN              7\n",
+       "NH              7\n",
+       "IA              6\n",
+       "GA              6\n",
+       "HI              5\n",
+       "CT              5\n",
+       "SD              4\n",
+       "SC              4\n",
+       "PR              4\n",
+       "MS              4\n",
+       "MO              4\n",
+       "AR              4\n",
+       "NV              3\n",
+       "VT              3\n",
+       "NE              3\n",
+       "KY              3\n",
+       "MT              3\n",
+       "ND              3\n",
+       "WY              3\n",
+       "AL              2\n",
+       "LA              2\n",
+       "KS              2\n",
+       "DE              2\n",
+       "OK              2\n",
+       "WV              2\n",
+       "RI              1\n",
+       "Indiana         1\n",
+       "DC              1"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.groupby('State').count().sort_values('location', ascending=False)[['location']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('Amarillo,', 'PlaceName'), ('TX', 'StateName')]"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df['Address'].values.tolist()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "anaconda-cloud": {},
+  "kernelspec": {
+   "display_name": "Python [conda root]",
+   "language": "python",
+   "name": "conda-root-py"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.4.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
--- a/State.ipynb
+++ b/State.ipynb
@ -0,0 +1,999 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 5 minute append to state column \n",
+    "\n",
+    "After taking some time to look around, I found usaddress for parsing states. This notebook shows how easy it is to parse addresses with usaddress. I feel that the list of tuples is a bit clunky, but it works, and I only spotted one error in the results... Indiana?? Really??"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import seaborn as sns\n",
+    "import usaddress\n",
+    "# Render matplotlib figures inline in the notebook output.\n",
+    "%matplotlib inline"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv('https://query.data.world/s/78ou6jcu4jfseul53lu1w3nio')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>location</th>\n",
+       "      <th>crowd-low-estimate</th>\n",
+       "      <th>crowd-high-estimate</th>\n",
+       "      <th>mean-high-low</th>\n",
+       "      <th>source</th>\n",
+       "      <th>Latitude</th>\n",
+       "      <th>Longitude</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Abilene, TX</td>\n",
+       "      <td>200</td>\n",
+       "      <td>200</td>\n",
+       "      <td>200.0</td>\n",
+       "      <td>http://www.reporternews.com/story/news/local/2...</td>\n",
+       "      <td>32.576489</td>\n",
+       "      <td>-99.665323</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Accident, MD</td>\n",
+       "      <td>54</td>\n",
+       "      <td>54</td>\n",
+       "      <td>54.0</td>\n",
+       "      <td>Twitter; on-site witness</td>\n",
+       "      <td>39.628700</td>\n",
+       "      <td>-79.319760</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Adak, AK</td>\n",
+       "      <td>10</td>\n",
+       "      <td>10</td>\n",
+       "      <td>10.0</td>\n",
+       "      <td>adn.com</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Adrian, MI</td>\n",
+       "      <td>130</td>\n",
+       "      <td>150</td>\n",
+       "      <td>140.0</td>\n",
+       "      <td>https://www.facebook.com/events/847360115406578/</td>\n",
+       "      <td>41.889943</td>\n",
+       "      <td>-84.065892</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Ajo, AZ</td>\n",
+       "      <td>250</td>\n",
+       "      <td>250</td>\n",
+       "      <td>250.0</td>\n",
+       "      <td>https://www.facebook.com/plugins/post.php?href...</td>\n",
+       "      <td>32.384890</td>\n",
+       "      <td>-112.890110</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       location crowd-low-estimate crowd-high-estimate  mean-high-low  \\\n",
+       "0   Abilene, TX                200                 200          200.0   \n",
+       "1  Accident, MD                 54                  54           54.0   \n",
+       "2      Adak, AK                 10                  10           10.0   \n",
+       "3    Adrian, MI                130                 150          140.0   \n",
+       "4       Ajo, AZ                250                 250          250.0   \n",
+       "\n",
+       "                                              source   Latitude   Longitude  \n",
+       "0  http://www.reporternews.com/story/news/local/2...  32.576489  -99.665323  \n",
+       "1                          Twitter; on-site witness   39.628700  -79.319760  \n",
+       "2                                            adn.com   0.000000    0.000000  \n",
+       "3   https://www.facebook.com/events/847360115406578/  41.889943  -84.065892  \n",
+       "4  https://www.facebook.com/plugins/post.php?href...  32.384890 -112.890110  "
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## original method\n",
+    "using last 2 characters of location"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "df['State'] = df['location'].str[-2:]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>location</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>State</th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>CA</th>\n",
+       "      <td>69</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>WA</th>\n",
+       "      <td>27</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>NY</th>\n",
+       "      <td>25</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>OR</th>\n",
+       "      <td>24</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>AK</th>\n",
+       "      <td>22</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       location\n",
+       "State          \n",
+       "CA           69\n",
+       "WA           27\n",
+       "NY           25\n",
+       "OR           24\n",
+       "AK           22"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.groupby('State').count().sort_values('location', ascending=False)[['location']].head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Looks like there are a few errors in State that would need to be cleansed (at least 12).\n",
+    "\n",
+    "15 minutes is not enough to fix this by hand. I know there is a package, mentioned on Talk Python To Me, that would do this faster. I'll have to look into that package another day."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "62"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(df.groupby('State').count())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>location</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>State</th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>DE</th>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>OK</th>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>LA</th>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>DC</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>RI</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ah</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>er</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>es</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ge</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>le</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>na</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>nd</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>on</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>t)</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>te</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       location\n",
+       "State          \n",
+       "DE            2\n",
+       "OK            2\n",
+       "LA            2\n",
+       "DC            1\n",
+       "RI            1\n",
+       "ah            1\n",
+       "er            1\n",
+       "es            1\n",
+       "ge            1\n",
+       "le            1\n",
+       "na            1\n",
+       "nd            1\n",
+       "on            1\n",
+       "t)            1\n",
+       "te            1"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.groupby('State').count().sort_values('location', ascending=False)[['location']].tail(15)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Usaddress Method"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "df['Address'] = df['location'].apply(usaddress.parse)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "10         [(Albuquerque,, PlaceName), (NM, StateName)]\n",
+       "11    [(Almanor, PlaceName), (West,, PlaceName), (CA...\n",
+       "12              [(Alpine,, PlaceName), (TX, StateName)]\n",
+       "13            [(Amarillo,, PlaceName), (TX, StateName)]\n",
+       "14    [(Amelia, StreetName), (Island,, StreetNamePos...\n",
+       "15           [(Anacortes,, PlaceName), (WA, StateName)]\n",
+       "16           [(Anchorage,, PlaceName), (AK, StateName)]\n",
+       "17    [(Ann, PlaceName), (Arbor,, PlaceName), (MI, S...\n",
+       "18           [(Annapolis,, PlaceName), (MD, StateName)]\n",
+       "19           [(Arlington,, PlaceName), (VA, StateName)]\n",
+       "Name: Address, dtype: object"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df['Address'].iloc[10:20]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>0</th>\n",
+       "      <th>1</th>\n",
+       "      <th>2</th>\n",
+       "      <th>3</th>\n",
+       "      <th>4</th>\n",
+       "      <th>5</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>(Albuquerque,, PlaceName)</td>\n",
+       "      <td>(NM, StateName)</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>(Almanor, PlaceName)</td>\n",
+       "      <td>(West,, PlaceName)</td>\n",
+       "      <td>(CA, StateName)</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>(Alpine,, PlaceName)</td>\n",
+       "      <td>(TX, StateName)</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>(Amarillo,, PlaceName)</td>\n",
+       "      <td>(TX, StateName)</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14</th>\n",
+       "      <td>(Amelia, StreetName)</td>\n",
+       "      <td>(Island,, StreetNamePostType)</td>\n",
+       "      <td>(FL, OccupancyType)</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15</th>\n",
+       "      <td>(Anacortes,, PlaceName)</td>\n",
+       "      <td>(WA, StateName)</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>16</th>\n",
+       "      <td>(Anchorage,, PlaceName)</td>\n",
+       "      <td>(AK, StateName)</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17</th>\n",
+       "      <td>(Ann, PlaceName)</td>\n",
+       "      <td>(Arbor,, PlaceName)</td>\n",
+       "      <td>(MI, StateName)</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18</th>\n",
+       "      <td>(Annapolis,, PlaceName)</td>\n",
+       "      <td>(MD, StateName)</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>19</th>\n",
+       "      <td>(Arlington,, PlaceName)</td>\n",
+       "      <td>(VA, StateName)</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                            0                              1  \\\n",
+       "10  (Albuquerque,, PlaceName)                (NM, StateName)   \n",
+       "11       (Almanor, PlaceName)             (West,, PlaceName)   \n",
+       "12       (Alpine,, PlaceName)                (TX, StateName)   \n",
+       "13     (Amarillo,, PlaceName)                (TX, StateName)   \n",
+       "14       (Amelia, StreetName)  (Island,, StreetNamePostType)   \n",
+       "15    (Anacortes,, PlaceName)                (WA, StateName)   \n",
+       "16    (Anchorage,, PlaceName)                (AK, StateName)   \n",
+       "17           (Ann, PlaceName)            (Arbor,, PlaceName)   \n",
+       "18    (Annapolis,, PlaceName)                (MD, StateName)   \n",
+       "19    (Arlington,, PlaceName)                (VA, StateName)   \n",
+       "\n",
+       "                      2     3     4     5  \n",
+       "10                 None  None  None  None  \n",
+       "11      (CA, StateName)  None  None  None  \n",
+       "12                 None  None  None  None  \n",
+       "13                 None  None  None  None  \n",
+       "14  (FL, OccupancyType)  None  None  None  \n",
+       "15                 None  None  None  None  \n",
+       "16                 None  None  None  None  \n",
+       "17      (MI, StateName)  None  None  None  \n",
+       "18                 None  None  None  None  \n",
+       "19                 None  None  None  None  "
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pd.DataFrame.from_records(df['Address'].values.tolist()).iloc[10:20]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "def get_state(lst):\n",
+    "    for tpl in lst:\n",
+    "        if tpl[1] == 'StateName':\n",
+    "            return tpl[0]\n",
+    "    return None"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "df['State'] = df['Address'].apply(get_state)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "53"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(df.groupby('State').count())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Errors? 53?\n",
+    "Looks like there were 53 states: it counted PR (Puerto Rico) and DC, which is good. But it also included Indiana as its own entry?? Not sure why, but I am impressed with the results."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>location</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>State</th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>CA</th>\n",
+       "      <td>70</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>WA</th>\n",
+       "      <td>27</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>NY</th>\n",
+       "      <td>25</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>OR</th>\n",
+       "      <td>24</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>AK</th>\n",
+       "      <td>22</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>MI</th>\n",
+       "      <td>20</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>TX</th>\n",
+       "      <td>20</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>PA</th>\n",
+       "      <td>19</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>CO</th>\n",
+       "      <td>19</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>WI</th>\n",
+       "      <td>16</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>NC</th>\n",
+       "      <td>15</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>FL</th>\n",
+       "      <td>14</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>VA</th>\n",
+       "      <td>13</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>NM</th>\n",
+       "      <td>11</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>OH</th>\n",
+       "      <td>11</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>AZ</th>\n",
+       "      <td>11</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ME</th>\n",
+       "      <td>11</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>NJ</th>\n",
+       "      <td>10</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>MN</th>\n",
+       "      <td>10</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>MA</th>\n",
+       "      <td>10</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>IL</th>\n",
+       "      <td>9</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>UT</th>\n",
+       "      <td>8</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>IN</th>\n",
+       "      <td>8</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ID</th>\n",
+       "      <td>8</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>MD</th>\n",
+       "      <td>7</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>TN</th>\n",
+       "      <td>7</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>NH</th>\n",
+       "      <td>7</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>IA</th>\n",
+       "      <td>6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>GA</th>\n",
+       "      <td>6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>HI</th>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>CT</th>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>SD</th>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>SC</th>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>PR</th>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>MS</th>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>MO</th>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>AR</th>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>NV</th>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>VT</th>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>NE</th>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>KY</th>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>MT</th>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ND</th>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>WY</th>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>AL</th>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>LA</th>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>KS</th>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>DE</th>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>OK</th>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>WV</th>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>RI</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Indiana</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>DC</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         location\n",
+       "State            \n",
+       "CA             70\n",
+       "WA             27\n",
+       "NY             25\n",
+       "OR             24\n",
+       "AK             22\n",
+       "MI             20\n",
+       "TX             20\n",
+       "PA             19\n",
+       "CO             19\n",
+       "WI             16\n",
+       "NC             15\n",
+       "FL             14\n",
+       "VA             13\n",
+       "NM             11\n",
+       "OH             11\n",
+       "AZ             11\n",
+       "ME             11\n",
+       "NJ             10\n",
+       "MN             10\n",
+       "MA             10\n",
+       "IL              9\n",
+       "UT              8\n",
+       "IN              8\n",
+       "ID              8\n",
+       "MD              7\n",
+       "TN              7\n",
+       "NH              7\n",
+       "IA              6\n",
+       "GA              6\n",
+       "HI              5\n",
+       "CT              5\n",
+       "SD              4\n",
+       "SC              4\n",
+       "PR              4\n",
+       "MS              4\n",
+       "MO              4\n",
+       "AR              4\n",
+       "NV              3\n",
+       "VT              3\n",
+       "NE              3\n",
+       "KY              3\n",
+       "MT              3\n",
+       "ND              3\n",
+       "WY              3\n",
+       "AL              2\n",
+       "LA              2\n",
+       "KS              2\n",
+       "DE              2\n",
+       "OK              2\n",
+       "WV              2\n",
+       "RI              1\n",
+       "Indiana         1\n",
+       "DC              1"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.groupby('State').count().sort_values('location', ascending=False)[['location']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('Amarillo,', 'PlaceName'), ('TX', 'StateName')]"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Show the single parsed record for row 13 instead of dumping the whole column.\n",
+    "df['Address'].iloc[13]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "anaconda-cloud": {},
+  "kernelspec": {
+   "display_name": "Python [conda root]",
+   "language": "python",
+   "name": "conda-root-py"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.4.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}