diff --git a/.ipynb_checkpoints/5min append State-checkpoint.ipynb b/.ipynb_checkpoints/5min append State-checkpoint.ipynb
new file mode 100644
index 0000000..d6d4d03
--- /dev/null
+++ b/.ipynb_checkpoints/5min append State-checkpoint.ipynb
@@ -0,0 +1,999 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 5 minute append to state column \n",
+ "\n",
+ "After taking some time to look around I found usaddress for parsing states. This notebook shows how easy it is to parse addresses with usaddress. I feel that the list of tuples is a bit clunky, but it works and I only spotted one error in the results... Indianna?? really??"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import seaborn as sns\n",
+ "import usaddress\n",
+ "% matplotlib inline"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load Data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "df = pd.read_csv('https://query.data.world/s/78ou6jcu4jfseul53lu1w3nio')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " location | \n",
+ " crowd-low-estimate | \n",
+ " crowd-high-estimate | \n",
+ " mean-high-low | \n",
+ " source | \n",
+ " Latitude | \n",
+ " Longitude | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Abilene, TX | \n",
+ " 200 | \n",
+ " 200 | \n",
+ " 200.0 | \n",
+ " http://www.reporternews.com/story/news/local/2... | \n",
+ " 32.576489 | \n",
+ " -99.665323 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Accident, MD | \n",
+ " 54 | \n",
+ " 54 | \n",
+ " 54.0 | \n",
+ " Twitter; on-site witness | \n",
+ " 39.628700 | \n",
+ " -79.319760 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Adak, AK | \n",
+ " 10 | \n",
+ " 10 | \n",
+ " 10.0 | \n",
+ " adn.com | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Adrian, MI | \n",
+ " 130 | \n",
+ " 150 | \n",
+ " 140.0 | \n",
+ " https://www.facebook.com/events/847360115406578/ | \n",
+ " 41.889943 | \n",
+ " -84.065892 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Ajo, AZ | \n",
+ " 250 | \n",
+ " 250 | \n",
+ " 250.0 | \n",
+ " https://www.facebook.com/plugins/post.php?href... | \n",
+ " 32.384890 | \n",
+ " -112.890110 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " location crowd-low-estimate crowd-high-estimate mean-high-low \\\n",
+ "0 Abilene, TX 200 200 200.0 \n",
+ "1 Accident, MD 54 54 54.0 \n",
+ "2 Adak, AK 10 10 10.0 \n",
+ "3 Adrian, MI 130 150 140.0 \n",
+ "4 Ajo, AZ 250 250 250.0 \n",
+ "\n",
+ " source Latitude Longitude \n",
+ "0 http://www.reporternews.com/story/news/local/2... 32.576489 -99.665323 \n",
+ "1 Twitter; on-site witness 39.628700 -79.319760 \n",
+ "2 adn.com 0.000000 0.000000 \n",
+ "3 https://www.facebook.com/events/847360115406578/ 41.889943 -84.065892 \n",
+ "4 https://www.facebook.com/plugins/post.php?href... 32.384890 -112.890110 "
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## original method\n",
+ "using last 2 characters of location"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "df['State'] = df['location'].str[-2:]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " location | \n",
+ "
\n",
+ " \n",
+ " | State | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | CA | \n",
+ " 69 | \n",
+ "
\n",
+ " \n",
+ " | WA | \n",
+ " 27 | \n",
+ "
\n",
+ " \n",
+ " | NY | \n",
+ " 25 | \n",
+ "
\n",
+ " \n",
+ " | OR | \n",
+ " 24 | \n",
+ "
\n",
+ " \n",
+ " | AK | \n",
+ " 22 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " location\n",
+ "State \n",
+ "CA 69\n",
+ "WA 27\n",
+ "NY 25\n",
+ "OR 24\n",
+ "AK 22"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.groupby('State').count().sort_values('location', ascending=False)[['location']].head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Looks Like there are a few errors in State that would need cleansed (at least 12)\n",
+ "\n",
+ "15 minutes is not enough to fix this by hand, I know there is a package that would do this faster that has been mentioned on talk python to me. Ill have to look into this package another day"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "62"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(df.groupby('State').count())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " location | \n",
+ "
\n",
+ " \n",
+ " | State | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | DE | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | OK | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | LA | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | DC | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | RI | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | ah | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | er | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | es | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | ge | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | le | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | na | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | nd | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | on | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | t) | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | te | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " location\n",
+ "State \n",
+ "DE 2\n",
+ "OK 2\n",
+ "LA 2\n",
+ "DC 1\n",
+ "RI 1\n",
+ "ah 1\n",
+ "er 1\n",
+ "es 1\n",
+ "ge 1\n",
+ "le 1\n",
+ "na 1\n",
+ "nd 1\n",
+ "on 1\n",
+ "t) 1\n",
+ "te 1"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.groupby('State').count().sort_values('location', ascending=False)[['location']].tail(15)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Usaddress Method"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "df['Address'] = df['location'].apply(usaddress.parse)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "10 [(Albuquerque,, PlaceName), (NM, StateName)]\n",
+ "11 [(Almanor, PlaceName), (West,, PlaceName), (CA...\n",
+ "12 [(Alpine,, PlaceName), (TX, StateName)]\n",
+ "13 [(Amarillo,, PlaceName), (TX, StateName)]\n",
+ "14 [(Amelia, StreetName), (Island,, StreetNamePos...\n",
+ "15 [(Anacortes,, PlaceName), (WA, StateName)]\n",
+ "16 [(Anchorage,, PlaceName), (AK, StateName)]\n",
+ "17 [(Ann, PlaceName), (Arbor,, PlaceName), (MI, S...\n",
+ "18 [(Annapolis,, PlaceName), (MD, StateName)]\n",
+ "19 [(Arlington,, PlaceName), (VA, StateName)]\n",
+ "Name: Address, dtype: object"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df['Address'].iloc[10:20]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 10 | \n",
+ " (Albuquerque,, PlaceName) | \n",
+ " (NM, StateName) | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " (Almanor, PlaceName) | \n",
+ " (West,, PlaceName) | \n",
+ " (CA, StateName) | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " (Alpine,, PlaceName) | \n",
+ " (TX, StateName) | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " (Amarillo,, PlaceName) | \n",
+ " (TX, StateName) | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " (Amelia, StreetName) | \n",
+ " (Island,, StreetNamePostType) | \n",
+ " (FL, OccupancyType) | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " (Anacortes,, PlaceName) | \n",
+ " (WA, StateName) | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " (Anchorage,, PlaceName) | \n",
+ " (AK, StateName) | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " (Ann, PlaceName) | \n",
+ " (Arbor,, PlaceName) | \n",
+ " (MI, StateName) | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " (Annapolis,, PlaceName) | \n",
+ " (MD, StateName) | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " (Arlington,, PlaceName) | \n",
+ " (VA, StateName) | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 \\\n",
+ "10 (Albuquerque,, PlaceName) (NM, StateName) \n",
+ "11 (Almanor, PlaceName) (West,, PlaceName) \n",
+ "12 (Alpine,, PlaceName) (TX, StateName) \n",
+ "13 (Amarillo,, PlaceName) (TX, StateName) \n",
+ "14 (Amelia, StreetName) (Island,, StreetNamePostType) \n",
+ "15 (Anacortes,, PlaceName) (WA, StateName) \n",
+ "16 (Anchorage,, PlaceName) (AK, StateName) \n",
+ "17 (Ann, PlaceName) (Arbor,, PlaceName) \n",
+ "18 (Annapolis,, PlaceName) (MD, StateName) \n",
+ "19 (Arlington,, PlaceName) (VA, StateName) \n",
+ "\n",
+ " 2 3 4 5 \n",
+ "10 None None None None \n",
+ "11 (CA, StateName) None None None \n",
+ "12 None None None None \n",
+ "13 None None None None \n",
+ "14 (FL, OccupancyType) None None None \n",
+ "15 None None None None \n",
+ "16 None None None None \n",
+ "17 (MI, StateName) None None None \n",
+ "18 None None None None \n",
+ "19 None None None None "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.DataFrame.from_records(df['Address'].values.tolist()).iloc[10:20]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "def get_state(lst):\n",
+ " for tpl in lst:\n",
+ " if tpl[1] == 'StateName':\n",
+ " return tpl[0]\n",
+ " return None"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "df['State'] = df['Address'].apply(get_state)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "53"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(df.groupby('State').count())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Errors? 53?\n",
+ "Looks like there were 53 states it counted PR (Puerto Rico), and DC which is good. But it also included Indianna?? not sure why, but I am impressed with the reults."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " location | \n",
+ "
\n",
+ " \n",
+ " | State | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | CA | \n",
+ " 70 | \n",
+ "
\n",
+ " \n",
+ " | WA | \n",
+ " 27 | \n",
+ "
\n",
+ " \n",
+ " | NY | \n",
+ " 25 | \n",
+ "
\n",
+ " \n",
+ " | OR | \n",
+ " 24 | \n",
+ "
\n",
+ " \n",
+ " | AK | \n",
+ " 22 | \n",
+ "
\n",
+ " \n",
+ " | MI | \n",
+ " 20 | \n",
+ "
\n",
+ " \n",
+ " | TX | \n",
+ " 20 | \n",
+ "
\n",
+ " \n",
+ " | PA | \n",
+ " 19 | \n",
+ "
\n",
+ " \n",
+ " | CO | \n",
+ " 19 | \n",
+ "
\n",
+ " \n",
+ " | WI | \n",
+ " 16 | \n",
+ "
\n",
+ " \n",
+ " | NC | \n",
+ " 15 | \n",
+ "
\n",
+ " \n",
+ " | FL | \n",
+ " 14 | \n",
+ "
\n",
+ " \n",
+ " | VA | \n",
+ " 13 | \n",
+ "
\n",
+ " \n",
+ " | NM | \n",
+ " 11 | \n",
+ "
\n",
+ " \n",
+ " | OH | \n",
+ " 11 | \n",
+ "
\n",
+ " \n",
+ " | AZ | \n",
+ " 11 | \n",
+ "
\n",
+ " \n",
+ " | ME | \n",
+ " 11 | \n",
+ "
\n",
+ " \n",
+ " | NJ | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " | MN | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " | MA | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " | IL | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | UT | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " | IN | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " | ID | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " | MD | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " | TN | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " | NH | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " | IA | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " | GA | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " | HI | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " | CT | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " | SD | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | SC | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | PR | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | MS | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | MO | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | AR | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | NV | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | VT | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | NE | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | KY | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | MT | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | ND | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | WY | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | AL | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | LA | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | KS | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | DE | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | OK | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | WV | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | RI | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | Indiana | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | DC | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " location\n",
+ "State \n",
+ "CA 70\n",
+ "WA 27\n",
+ "NY 25\n",
+ "OR 24\n",
+ "AK 22\n",
+ "MI 20\n",
+ "TX 20\n",
+ "PA 19\n",
+ "CO 19\n",
+ "WI 16\n",
+ "NC 15\n",
+ "FL 14\n",
+ "VA 13\n",
+ "NM 11\n",
+ "OH 11\n",
+ "AZ 11\n",
+ "ME 11\n",
+ "NJ 10\n",
+ "MN 10\n",
+ "MA 10\n",
+ "IL 9\n",
+ "UT 8\n",
+ "IN 8\n",
+ "ID 8\n",
+ "MD 7\n",
+ "TN 7\n",
+ "NH 7\n",
+ "IA 6\n",
+ "GA 6\n",
+ "HI 5\n",
+ "CT 5\n",
+ "SD 4\n",
+ "SC 4\n",
+ "PR 4\n",
+ "MS 4\n",
+ "MO 4\n",
+ "AR 4\n",
+ "NV 3\n",
+ "VT 3\n",
+ "NE 3\n",
+ "KY 3\n",
+ "MT 3\n",
+ "ND 3\n",
+ "WY 3\n",
+ "AL 2\n",
+ "LA 2\n",
+ "KS 2\n",
+ "DE 2\n",
+ "OK 2\n",
+ "WV 2\n",
+ "RI 1\n",
+ "Indiana 1\n",
+ "DC 1"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.groupby('State').count().sort_values('location', ascending=False)[['location']]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[('Amarillo,', 'PlaceName'), ('TX', 'StateName')]"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df['Address'].values.tolist()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "anaconda-cloud": {},
+ "kernelspec": {
+ "display_name": "Python [conda root]",
+ "language": "python",
+ "name": "conda-root-py"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.4.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/5min append State.ipynb b/5min append State.ipynb
new file mode 100644
index 0000000..d6d4d03
--- /dev/null
+++ b/5min append State.ipynb
@@ -0,0 +1,999 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 5 minute append to state column \n",
+ "\n",
+ "After taking some time to look around I found usaddress for parsing states. This notebook shows how easy it is to parse addresses with usaddress. I feel that the list of tuples is a bit clunky, but it works and I only spotted one error in the results... Indianna?? really??"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import seaborn as sns\n",
+ "import usaddress\n",
+ "% matplotlib inline"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load Data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "df = pd.read_csv('https://query.data.world/s/78ou6jcu4jfseul53lu1w3nio')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " location | \n",
+ " crowd-low-estimate | \n",
+ " crowd-high-estimate | \n",
+ " mean-high-low | \n",
+ " source | \n",
+ " Latitude | \n",
+ " Longitude | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Abilene, TX | \n",
+ " 200 | \n",
+ " 200 | \n",
+ " 200.0 | \n",
+ " http://www.reporternews.com/story/news/local/2... | \n",
+ " 32.576489 | \n",
+ " -99.665323 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Accident, MD | \n",
+ " 54 | \n",
+ " 54 | \n",
+ " 54.0 | \n",
+ " Twitter; on-site witness | \n",
+ " 39.628700 | \n",
+ " -79.319760 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Adak, AK | \n",
+ " 10 | \n",
+ " 10 | \n",
+ " 10.0 | \n",
+ " adn.com | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Adrian, MI | \n",
+ " 130 | \n",
+ " 150 | \n",
+ " 140.0 | \n",
+ " https://www.facebook.com/events/847360115406578/ | \n",
+ " 41.889943 | \n",
+ " -84.065892 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Ajo, AZ | \n",
+ " 250 | \n",
+ " 250 | \n",
+ " 250.0 | \n",
+ " https://www.facebook.com/plugins/post.php?href... | \n",
+ " 32.384890 | \n",
+ " -112.890110 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " location crowd-low-estimate crowd-high-estimate mean-high-low \\\n",
+ "0 Abilene, TX 200 200 200.0 \n",
+ "1 Accident, MD 54 54 54.0 \n",
+ "2 Adak, AK 10 10 10.0 \n",
+ "3 Adrian, MI 130 150 140.0 \n",
+ "4 Ajo, AZ 250 250 250.0 \n",
+ "\n",
+ " source Latitude Longitude \n",
+ "0 http://www.reporternews.com/story/news/local/2... 32.576489 -99.665323 \n",
+ "1 Twitter; on-site witness 39.628700 -79.319760 \n",
+ "2 adn.com 0.000000 0.000000 \n",
+ "3 https://www.facebook.com/events/847360115406578/ 41.889943 -84.065892 \n",
+ "4 https://www.facebook.com/plugins/post.php?href... 32.384890 -112.890110 "
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## original method\n",
+ "using last 2 characters of location"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "df['State'] = df['location'].str[-2:]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " location | \n",
+ "
\n",
+ " \n",
+ " | State | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | CA | \n",
+ " 69 | \n",
+ "
\n",
+ " \n",
+ " | WA | \n",
+ " 27 | \n",
+ "
\n",
+ " \n",
+ " | NY | \n",
+ " 25 | \n",
+ "
\n",
+ " \n",
+ " | OR | \n",
+ " 24 | \n",
+ "
\n",
+ " \n",
+ " | AK | \n",
+ " 22 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " location\n",
+ "State \n",
+ "CA 69\n",
+ "WA 27\n",
+ "NY 25\n",
+ "OR 24\n",
+ "AK 22"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.groupby('State').count().sort_values('location', ascending=False)[['location']].head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Looks Like there are a few errors in State that would need cleansed (at least 12)\n",
+ "\n",
+ "15 minutes is not enough to fix this by hand, I know there is a package that would do this faster that has been mentioned on talk python to me. Ill have to look into this package another day"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "62"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(df.groupby('State').count())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " location | \n",
+ "
\n",
+ " \n",
+ " | State | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | DE | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | OK | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | LA | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | DC | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | RI | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | ah | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | er | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | es | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | ge | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | le | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | na | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | nd | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | on | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | t) | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | te | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " location\n",
+ "State \n",
+ "DE 2\n",
+ "OK 2\n",
+ "LA 2\n",
+ "DC 1\n",
+ "RI 1\n",
+ "ah 1\n",
+ "er 1\n",
+ "es 1\n",
+ "ge 1\n",
+ "le 1\n",
+ "na 1\n",
+ "nd 1\n",
+ "on 1\n",
+ "t) 1\n",
+ "te 1"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.groupby('State').count().sort_values('location', ascending=False)[['location']].tail(15)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Usaddress Method"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "df['Address'] = df['location'].apply(usaddress.parse)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "10 [(Albuquerque,, PlaceName), (NM, StateName)]\n",
+ "11 [(Almanor, PlaceName), (West,, PlaceName), (CA...\n",
+ "12 [(Alpine,, PlaceName), (TX, StateName)]\n",
+ "13 [(Amarillo,, PlaceName), (TX, StateName)]\n",
+ "14 [(Amelia, StreetName), (Island,, StreetNamePos...\n",
+ "15 [(Anacortes,, PlaceName), (WA, StateName)]\n",
+ "16 [(Anchorage,, PlaceName), (AK, StateName)]\n",
+ "17 [(Ann, PlaceName), (Arbor,, PlaceName), (MI, S...\n",
+ "18 [(Annapolis,, PlaceName), (MD, StateName)]\n",
+ "19 [(Arlington,, PlaceName), (VA, StateName)]\n",
+ "Name: Address, dtype: object"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df['Address'].iloc[10:20]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 10 | \n",
+ " (Albuquerque,, PlaceName) | \n",
+ " (NM, StateName) | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " (Almanor, PlaceName) | \n",
+ " (West,, PlaceName) | \n",
+ " (CA, StateName) | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " (Alpine,, PlaceName) | \n",
+ " (TX, StateName) | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " (Amarillo,, PlaceName) | \n",
+ " (TX, StateName) | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " (Amelia, StreetName) | \n",
+ " (Island,, StreetNamePostType) | \n",
+ " (FL, OccupancyType) | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " (Anacortes,, PlaceName) | \n",
+ " (WA, StateName) | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " (Anchorage,, PlaceName) | \n",
+ " (AK, StateName) | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " (Ann, PlaceName) | \n",
+ " (Arbor,, PlaceName) | \n",
+ " (MI, StateName) | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " (Annapolis,, PlaceName) | \n",
+ " (MD, StateName) | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " (Arlington,, PlaceName) | \n",
+ " (VA, StateName) | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 \\\n",
+ "10 (Albuquerque,, PlaceName) (NM, StateName) \n",
+ "11 (Almanor, PlaceName) (West,, PlaceName) \n",
+ "12 (Alpine,, PlaceName) (TX, StateName) \n",
+ "13 (Amarillo,, PlaceName) (TX, StateName) \n",
+ "14 (Amelia, StreetName) (Island,, StreetNamePostType) \n",
+ "15 (Anacortes,, PlaceName) (WA, StateName) \n",
+ "16 (Anchorage,, PlaceName) (AK, StateName) \n",
+ "17 (Ann, PlaceName) (Arbor,, PlaceName) \n",
+ "18 (Annapolis,, PlaceName) (MD, StateName) \n",
+ "19 (Arlington,, PlaceName) (VA, StateName) \n",
+ "\n",
+ " 2 3 4 5 \n",
+ "10 None None None None \n",
+ "11 (CA, StateName) None None None \n",
+ "12 None None None None \n",
+ "13 None None None None \n",
+ "14 (FL, OccupancyType) None None None \n",
+ "15 None None None None \n",
+ "16 None None None None \n",
+ "17 (MI, StateName) None None None \n",
+ "18 None None None None \n",
+ "19 None None None None "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.DataFrame.from_records(df['Address'].values.tolist()).iloc[10:20]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "def get_state(lst):\n",
+ " for tpl in lst:\n",
+ " if tpl[1] == 'StateName':\n",
+ " return tpl[0]\n",
+ " return None"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "df['State'] = df['Address'].apply(get_state)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "53"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(df.groupby('State').count())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Errors? 53?\n",
+ "Looks like there were 53 states it counted PR (Puerto Rico), and DC which is good. But it also included Indianna?? not sure why, but I am impressed with the reults."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " location | \n",
+ "
\n",
+ " \n",
+ " | State | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | CA | \n",
+ " 70 | \n",
+ "
\n",
+ " \n",
+ " | WA | \n",
+ " 27 | \n",
+ "
\n",
+ " \n",
+ " | NY | \n",
+ " 25 | \n",
+ "
\n",
+ " \n",
+ " | OR | \n",
+ " 24 | \n",
+ "
\n",
+ " \n",
+ " | AK | \n",
+ " 22 | \n",
+ "
\n",
+ " \n",
+ " | MI | \n",
+ " 20 | \n",
+ "
\n",
+ " \n",
+ " | TX | \n",
+ " 20 | \n",
+ "
\n",
+ " \n",
+ " | PA | \n",
+ " 19 | \n",
+ "
\n",
+ " \n",
+ " | CO | \n",
+ " 19 | \n",
+ "
\n",
+ " \n",
+ " | WI | \n",
+ " 16 | \n",
+ "
\n",
+ " \n",
+ " | NC | \n",
+ " 15 | \n",
+ "
\n",
+ " \n",
+ " | FL | \n",
+ " 14 | \n",
+ "
\n",
+ " \n",
+ " | VA | \n",
+ " 13 | \n",
+ "
\n",
+ " \n",
+ " | NM | \n",
+ " 11 | \n",
+ "
\n",
+ " \n",
+ " | OH | \n",
+ " 11 | \n",
+ "
\n",
+ " \n",
+ " | AZ | \n",
+ " 11 | \n",
+ "
\n",
+ " \n",
+ " | ME | \n",
+ " 11 | \n",
+ "
\n",
+ " \n",
+ " | NJ | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " | MN | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " | MA | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " | IL | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | UT | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " | IN | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " | ID | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " | MD | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " | TN | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " | NH | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " | IA | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " | GA | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " | HI | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " | CT | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " | SD | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | SC | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | PR | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | MS | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | MO | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | AR | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | NV | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | VT | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | NE | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | KY | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | MT | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | ND | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | WY | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | AL | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | LA | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | KS | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | DE | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | OK | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | WV | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | RI | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | Indiana | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | DC | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " location\n",
+ "State \n",
+ "CA 70\n",
+ "WA 27\n",
+ "NY 25\n",
+ "OR 24\n",
+ "AK 22\n",
+ "MI 20\n",
+ "TX 20\n",
+ "PA 19\n",
+ "CO 19\n",
+ "WI 16\n",
+ "NC 15\n",
+ "FL 14\n",
+ "VA 13\n",
+ "NM 11\n",
+ "OH 11\n",
+ "AZ 11\n",
+ "ME 11\n",
+ "NJ 10\n",
+ "MN 10\n",
+ "MA 10\n",
+ "IL 9\n",
+ "UT 8\n",
+ "IN 8\n",
+ "ID 8\n",
+ "MD 7\n",
+ "TN 7\n",
+ "NH 7\n",
+ "IA 6\n",
+ "GA 6\n",
+ "HI 5\n",
+ "CT 5\n",
+ "SD 4\n",
+ "SC 4\n",
+ "PR 4\n",
+ "MS 4\n",
+ "MO 4\n",
+ "AR 4\n",
+ "NV 3\n",
+ "VT 3\n",
+ "NE 3\n",
+ "KY 3\n",
+ "MT 3\n",
+ "ND 3\n",
+ "WY 3\n",
+ "AL 2\n",
+ "LA 2\n",
+ "KS 2\n",
+ "DE 2\n",
+ "OK 2\n",
+ "WV 2\n",
+ "RI 1\n",
+ "Indiana 1\n",
+ "DC 1"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.groupby('State').count().sort_values('location', ascending=False)[['location']]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[('Amarillo,', 'PlaceName'), ('TX', 'StateName')]"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df['Address'].values.tolist()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "anaconda-cloud": {},
+ "kernelspec": {
+ "display_name": "Python [conda root]",
+ "language": "python",
+ "name": "conda-root-py"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.4.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}