# Import libraries
import collections
import datetime
from ggplot import *
import json
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib_venn import venn2, venn2_circles
from matplotlib_venn import venn3, venn3_circles
import numpy as np
from numpy.polynomial.polynomial import polyfit
import pandas as pd
import requests
import seaborn as sns
sns.set_style('whitegrid')
sns.set_palette("bright", 10)
import sqlite3
import tweepy

C:\Users\linds\Anaconda3\envs\dand\lib\site-packages\ggplot\utils.py:81: FutureWarning: pandas.tslib is deprecated and will be removed in a future version.
You can access Timestamp as pandas.Timestamp
  pd.tslib.Timestamp,
C:\Users\linds\Anaconda3\envs\dand\lib\site-packages\ggplot\stats\smoothers.py:4: FutureWarning: The pandas.lib module is deprecated and will be removed in a future version. These are private functions and can be accessed from pandas._libs.lib instead
  from pandas.lib import Timestamp


# Load WeRateDogs enhanced twitter archive from Udacity.
twitter_archive = pd.read_csv('twitter-archive-enhanced.csv')
twitter_archive.head(2)


twitter_archive.shape

(2356, 17)


# Download the image-prediction table from Udacity's server into `response`.
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
# A timeout keeps the notebook from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx reply so an error page is never saved as the .tsv.
response.raise_for_status()


# Save the raw bytes to disk so later runs can work from the local copy.
with open("image_predictions.tsv", mode='wb') as file:
    file.write(response.content)


# Now bring image_predictions.tsv back into memory.
image_predictions = pd.read_csv('image_predictions.tsv', sep = '\t')
image_predictions.head()


len(image_predictions)

2075


# Build an authenticated Twitter API client.
# The credentials are read from environment variables so that no secret is
# stored in the notebook. The keys and tokens that used to be hard-coded in
# this cell have been exposed with the file and should be revoked.
import os

consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', '')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '')
access_secret = os.environ.get('TWITTER_ACCESS_SECRET', '')

# Only report missing credentials instead of raising: the rest of the
# notebook works from files on disk and does not need the API.
if not (consumer_key and consumer_secret and access_token and access_secret):
    print('Twitter credentials are incomplete; set TWITTER_CONSUMER_KEY, '
          'TWITTER_CONSUMER_SECRET, TWITTER_ACCESS_TOKEN and '
          'TWITTER_ACCESS_SECRET before calling the API.')

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# Responses are parsed to plain JSON structures, and the client waits
# (with a notification) whenever the rate limit is reached.
api = tweepy.API(auth, parser=tweepy.parsers.JSONParser(),
                 wait_on_rate_limit=True, wait_on_rate_limit_notify=True)


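# The API looks tweets up by tweet_id, so each tweet_id from the archive is
# requested in turn to get its JSON data.
# The download code below is wrapped in a string literal to disable it: the
# download is slow and its result is already stored in twitter_counts.csv.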
"""
twitter_list = []
errors = []
ta_tweet_id_list = twitter_archive.tweet_id.tolist()
status_list = range(1, len(ta_tweet_id_list), 100)

# For loop to retrieve tweets in json format. We expect errors.
for index, id in enumerate(ta_tweet_id_list):
    try:
        page = api.get_status(id)
        favorites = page['favorite_count']
        retweets = page['retweet_count']
        twitter_list.append({'tweet_id': int(id),
                        'favorites': int(favorites),
                        'retweets': int(retweets)})
        if index in status_list:
            print('Completed ~', index, ' Length of twitter_list is', 
                  len(twitter_list), ' Length of errors list is', 
                  len(errors))
        else:
            pass
    except Exception as e:
        errors.append((id, e))

# Put the list of dictionaries into twitter_counts with tweet_id first.
twitter_counts = pd.DataFrame.from_dict(twitter_list)
# Reorder the columns
twitter_counts = twitter_counts[['tweet_id', 'favorites', 'retweets']]       

# Store twitter_counts on disk now that you have retrieved the data.
# It is time consuming to download this from twitter.
twitter_counts.to_csv('twitter_counts.csv', encoding='utf-8', index=False)
"""

"\ntwitter_list = []\nerrors = []\nta_tweet_id_list = twitter_archive.tweet_id.tolist()\nstatus_list = range(1, len(ta_tweet_id_list), 100)\n\n# For loop to retrieve tweets in json format. We expect errors.\nfor index, id in enumerate(ta_tweet_id_list):\n    try:\n        page = api.get_status(id)\n        favorites = page['favorite_count']\n        retweets = page['retweet_count']\n        twitter_list.append({'tweet_id': int(id),\n                        'favorites': int(favorites),\n                        'retweets': int(retweets)})\n        if index in status_list:\n            print('Completed ~', index, ' Length of twitter_list is', \n                  len(twitter_list), ' Length of errors list is', \n                  len(errors))\n        else:\n            pass\n    except Exception as e:\n        errors.append((id, e))\n\n# Put the list of dictionaries into twitter_counts with tweet_id first.\ntwitter_counts = pd.DataFrame.from_dict(twitter_list)\n# Reorder the columns\ntwitter_counts = twitter_counts[['tweet_id', 'favorites', 'retweets']]       \n\n# Store twitter_counts on disk now that you have retrieved the data.\n# It is time consuming to download this from twitter.\ntwitter_counts.to_csv('twitter_counts.csv', encoding='utf-8', index=False)\n"


# Check and make sure it reads in OK
twitter_counts = pd.read_csv('twitter_counts.csv', encoding='utf-8')
twitter_counts.head()


# Sanity check: print the number of digits of one sample tweet_id taken from
# each place an id appears, so the lengths can be compared by eye.
sample_ids = [
    ('length of twitter_archive.tweet_id in excel spreadsheet is',
     '892420643555336000'),
    ('length of twitter_archive.tweet_id is',
     '892177421306343426'),
    ('length of image_predictions.tweet_id is',
     '666020888022790149'),
    ('length of twitter_archive tweet_id in expanded_urls is',
     '892420643555336193'),
]
for label, sample_id in sample_ids:
    print(label, len(sample_id))

length of twitter_archive.tweet_id in excel spreadsheet is 18
length of twitter_archive.tweet_id is 18
length of image_predictions.tweet_id is 18
length of twitter_archive tweet_id in expanded_urls is 18


# Pool the column names of the three dataframes into one Series and show
# every name that has already appeared earlier in the pool.
columns = pd.Series(
    twitter_archive.columns.tolist()
    + image_predictions.columns.tolist()
    + twitter_counts.columns.tolist())
columns[columns.duplicated()]

17    tweet_id
29    tweet_id
dtype: object


# Let's see what the intersection of twitter_archive and image_predictions is.
# Want to see if anything surprising.
taid = set(twitter_archive.tweet_id)
len(taid)

2356


ipid = set(image_predictions.tweet_id)
len(ipid)

2075


# tweet_ids present in both the archive and the image predictions.
common = taid & ipid
len(common)

2075


not_common = list(set(taid) - set(ipid))
len(not_common)

281


# Math works. No problem.
len(common) + len(not_common)

2356


twitter_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), object(10)
memory usage: 313.0+ KB


twitter_archive.sample(5)


# Check for duplicated tweet_id's.
twitter_archive[twitter_archive.tweet_id.duplicated()]


# Most of the ratings use 10 as a denominator. How many do not?
# The rows are counted directly, so the answer does not depend on 10 being
# the most frequent denominator in value_counts().
nof_not_10_as_denom = int((twitter_archive.rating_denominator != 10).sum())
nof_not_10_as_denom

23


# Allow up to 200 characters per cell so the tweet text is not cut short.
pd.set_option('display.max_colwidth', 200)
# Tweets whose denominator is not 10, limited to the columns needed to
# compare the parsed rating with the tweet text.
denom_is_odd = twitter_archive.rating_denominator != 10
funny_ratings_denom = twitter_archive.loc[
    denom_is_odd,
    ['tweet_id', 'text', 'rating_numerator', 'rating_denominator']]
funny_ratings_denom.sample(5)


len(funny_ratings_denom)

23


twitter_archive.rating_numerator.value_counts().head(10)

12    558
11    464
10    461
13    351
9     158
8     102
7      55
14     54
5      37
6      32
Name: rating_numerator, dtype: int64


twitter_archive.rating_numerator.max()

1776


# Tweets with a numerator above 14, largest numerator first, limited to the
# columns needed to compare the parsed rating with the tweet text.
funny_ratings_num = (
    twitter_archive.loc[
        twitter_archive.rating_numerator > 14,
        ['tweet_id', 'text', 'rating_numerator', 'rating_denominator']]
    .sort_values('rating_numerator', ascending=False))
funny_ratings_num.head()


len(funny_ratings_num)

28


twitter_archive.source.value_counts()

<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>     2221
<a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>                          91
<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>                       33
<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>      11
Name: source, dtype: int64


twitter_archive.name.value_counts().head()

None       745
a           55
Charlie     12
Lucy        11
Cooper      11
Name: name, dtype: int64


image_predictions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


image_predictions.head()


image_predictions.sample(5)


# Let's see if there are any odd spellings that cause a problem.
image_predictions.p1.nunique()

378


image_predictions.p1.str.lower().nunique()

378


# Let's see if there are any odd spellings that cause a problem.
image_predictions.p2.nunique()

405


image_predictions.p2.str.lower().nunique()

404


original = image_predictions.p2.unique().tolist()
lowercase = image_predictions.p2.str.lower().unique().tolist()
original = [x.lower() for x in original]


# Well cardigan is a "problem". I don't think this is serious at all.
print([item for item, count in collections.Counter(original).items() if count > 1])

['cardigan']


# Let's see if there are any odd spellings that cause a problem.
image_predictions.p3.nunique()

408


image_predictions.p3.str.lower().nunique()

408


# There are a few jpg urls that are identical. Mmm, not sure what that means. 
image_predictions[image_predictions.jpg_url.duplicated()].count()

tweet_id    66
jpg_url     66
img_num     66
p1          66
p1_conf     66
p1_dog      66
p2          66
p2_conf     66
p2_dog      66
p3          66
p3_conf     66
p3_dog      66
dtype: int64


image_predictions[image_predictions.jpg_url.duplicated(keep = False)].sort_values(
    by = 'jpg_url').head()


twitter_archive[twitter_archive.tweet_id == 675354435921575936].head()


twitter_archive[twitter_archive.tweet_id == 752309394570878976].head()


twitter_counts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2345 entries, 0 to 2344
Data columns (total 3 columns):
tweet_id     2345 non-null int64
favorites    2345 non-null int64
retweets     2345 non-null int64
dtypes: int64(3)
memory usage: 55.0 KB


twitter_counts.head()


twitter_counts[twitter_counts.tweet_id.duplicated()]


# Make copies of files
archive_clean = twitter_archive.copy()


image_clean = image_predictions.copy()


counts_clean = twitter_counts.copy()


# Keep only rows that are neither a retweet nor a reply, i.e. rows where
# both user-id columns are empty.
is_retweet = archive_clean.retweeted_status_user_id.notnull()
is_reply = archive_clean.in_reply_to_user_id.notnull()
archive_clean = archive_clean[~is_retweet & ~is_reply]
# Remove the reply and retweet columns from the table.
archive_clean = archive_clean.drop(
    ['in_reply_to_user_id', 'in_reply_to_status_id',
     'retweeted_status_id', 'retweeted_status_user_id',
     'retweeted_status_timestamp'],
    axis=1)


archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2097 entries, 0 to 2355
Data columns (total 12 columns):
tweet_id              2097 non-null int64
timestamp             2097 non-null object
source                2097 non-null object
text                  2097 non-null object
expanded_urls         2094 non-null object
rating_numerator      2097 non-null int64
rating_denominator    2097 non-null int64
name                  2097 non-null object
doggo                 2097 non-null object
floofer               2097 non-null object
pupper                2097 non-null object
puppo                 2097 non-null object
dtypes: int64(3), object(9)
memory usage: 213.0+ KB


archive_clean.head(2)


archive_clean.timestamp = pd.to_datetime(
    archive_clean.timestamp, infer_datetime_format=True)


archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2097 entries, 0 to 2355
Data columns (total 12 columns):
tweet_id              2097 non-null int64
timestamp             2097 non-null datetime64[ns]
source                2097 non-null object
text                  2097 non-null object
expanded_urls         2094 non-null object
rating_numerator      2097 non-null int64
rating_denominator    2097 non-null int64
name                  2097 non-null object
doggo                 2097 non-null object
floofer               2097 non-null object
pupper                2097 non-null object
puppo                 2097 non-null object
dtypes: datetime64[ns](1), int64(3), object(8)
memory usage: 213.0+ KB


# Keep only rows with a numerator below 15 and a denominator of exactly 10.
has_valid_rating = (
    (archive_clean.rating_numerator < 15)
    & (archive_clean.rating_denominator == 10))
archive_clean = archive_clean[has_valid_rating]


# Express each rating as a single fraction.
rating_values = (
    archive_clean.rating_numerator / archive_clean.rating_denominator)
archive_clean['rating'] = rating_values


# The two source columns are replaced by `rating`.
archive_clean = archive_clean.drop(
    ['rating_numerator', 'rating_denominator'], axis=1)


len(archive_clean)

2075


archive_clean.head(2)


# Set up gender lists.
# calc_gender removes apostrophes before matching, so contractions such as
# "he's" are matched through their apostrophe-free spelling ("hes").
male = ['boy', 'he', "he's", 'hes', "he's", 'him', 'himself', 'his',
        'male', 'stud']
female = ['bitch', 'female', 'girl', 'her', 'hers', "her's",
          'herself', 'she', 'shes', "she's"]


def calc_gender(text):
    """Classify a tweet by the first gendered word it contains.

    Apostrophes are removed, the text is lower-cased and split on
    whitespace, and the resulting words are checked in order against the
    `male` and `female` lists. Words are compared as-is, so a word with
    punctuation attached (e.g. "boy.") does not match.

    Parameters
    ----------
    text : str
        The tweet text.

    Returns
    -------
    int or float
        0 if a word from `male` is found first, 1 if a word from `female`
        is found first, np.nan if the text contains neither (including
        empty text).
    """
    words = text.replace("'", "").lower().split()
    for word in words:
        if word in male:
            return 0
        if word in female:
            return 1
    # Give up only after every word has been checked; returning inside the
    # loop would classify the tweet from its first word alone.
    return np.nan


# Apply the function.
archive_clean['gender'] = archive_clean['text'].apply(calc_gender)


archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2075 entries, 0 to 2355
Data columns (total 12 columns):
tweet_id         2075 non-null int64
timestamp        2075 non-null datetime64[ns]
source           2075 non-null object
text             2075 non-null object
expanded_urls    2072 non-null object
name             2075 non-null object
doggo            2075 non-null object
floofer          2075 non-null object
pupper           2075 non-null object
puppo            2075 non-null object
rating           2075 non-null float64
gender           1399 non-null float64
dtypes: datetime64[ns](1), float64(2), int64(1), object(8)
memory usage: 210.7+ KB


archive_clean.gender.head(10)

0    0.0
1    1.0
2    0.0
3    1.0
4    0.0
5    NaN
6    0.0
7    0.0
8    1.0
9    1.0
Name: gender, dtype: float64


archive_clean.text.head(10)

0                                                             This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU
1        This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https://t.co/0Xxu71qeIV
2                         This is Archie. He is a rare Norwegian Pouncing Corgo. Lives in the tall grass. You never know when one may strike. 12/10 https://t.co/wUnZnhtVJB
3                                                                   This is Darla. She commenced a snooze mid meal. 13/10 happens to the best of us https://t.co/tD36da7qLQ
4        This is Franklin. He would like you to stop calling him "cute." He is a very fierce shark and should be respected as such. 12/10 #BarkWeek https://t.co/AtUZn91f7f
5        Here we have a majestic great white breaching off South Africa's coast. Absolutely h*ckin breathtaking. 13/10 (IG: tucker_marlo) #BarkWeek https://t.co/kQ04fDDRmh
6    Meet Jax. He enjoys ice cream so much he gets nervous around it. 13/10 help Jax enjoy more things by clicking below\n\nhttps://t.co/Zr4hWfAs1H https://t.co/tVJBRMnhxl
7                            When you watch your owner call another dog a good boy but then they turn back to you and say you're a great boy. 13/10 https://t.co/v0nONBcwxq
8                        This is Zoey. She doesn't want to be one of the scary sharks. Just wants to be a snuggly pettable boatpet. 13/10 #BarkWeek https://t.co/9TwLuAGH0b
9             This is Cassie. She is a college pup. Studying international doggo communication and stick theory. 14/10 so elegant much sophisticate https://t.co/t1bfwz5S2A
Name: text, dtype: object


# Turn the "None" placeholder strings in these columns into real missing
# values. Each cleaned column is assigned back to the frame: calling
# replace(..., inplace=True) on archive_clean[column] acts on a selected
# column and is not guaranteed to write through to archive_clean.
columns = ['name', 'doggo', 'floofer', 'pupper', 'puppo']
for column in columns:
    archive_clean[column] = archive_clean[column].replace("None", np.nan)


for column in columns:
    print(archive_clean[archive_clean[column] == "None"])

Empty DataFrame
Columns: [tweet_id, timestamp, source, text, expanded_urls, name, doggo, floofer, pupper, puppo, rating, gender]
Index: []
Empty DataFrame
Columns: [tweet_id, timestamp, source, text, expanded_urls, name, doggo, floofer, pupper, puppo, rating, gender]
Index: []
Empty DataFrame
Columns: [tweet_id, timestamp, source, text, expanded_urls, name, doggo, floofer, pupper, puppo, rating, gender]
Index: []
Empty DataFrame
Columns: [tweet_id, timestamp, source, text, expanded_urls, name, doggo, floofer, pupper, puppo, rating, gender]
Index: []
Empty DataFrame
Columns: [tweet_id, timestamp, source, text, expanded_urls, name, doggo, floofer, pupper, puppo, rating, gender]
Index: []


archive_clean.source.unique().tolist()

['<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
 '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>',
 '<a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>',
 '<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>']


# Replace the HTML anchor stored in `source` with a short label.
# Each anchor is listed next to its label explicitly. Pairing the labels with
# whatever order unique() returns would mislabel the sources if the row order
# changed, and would break if this cell were run a second time.
source_html = [
    '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
    '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>',
    '<a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>',
    '<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>',
]
replace_html = ["iPhone", "Twitter", "Vine", "TweetDeck"]
# Assign the result back instead of replacing in place on the selected
# column, then store the few distinct labels as a categorical.
archive_clean.source = archive_clean.source.replace(
    source_html, replace_html).astype('category')


archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2075 entries, 0 to 2355
Data columns (total 12 columns):
tweet_id         2075 non-null int64
timestamp        2075 non-null datetime64[ns]
source           2075 non-null category
text             2075 non-null object
expanded_urls    2072 non-null object
name             1485 non-null object
doggo            83 non-null object
floofer          10 non-null object
pupper           229 non-null object
puppo            24 non-null object
rating           2075 non-null float64
gender           1399 non-null float64
dtypes: category(1), datetime64[ns](1), float64(2), int64(1), object(7)
memory usage: 196.7+ KB


# Show some offending values
archive_clean['name'].value_counts().head()

a          54
Charlie    11
Lucy       11
Cooper     10
Oliver     10
Name: name, dtype: int64


archive_clean['name'].head(10)

0     Phineas
1       Tilly
2      Archie
3       Darla
4    Franklin
5         NaN
6         Jax
7         NaN
8        Zoey
9      Cassie
Name: name, dtype: object


# There are some numbers in here (sigh)
# Cast every entry to str before using the .str accessor below.
# Note that this also turns missing values into the string 'nan'.
archive_clean['name'] = archive_clean['name'].astype(str)

# Entries made up only of lowercase letters, digits and whitespace (e.g. "a")
# are not dog names. str.match flags those entries as True and mask() replaces
# the flagged entries with NaN. The 'nan' strings produced by astype(str) above
# match the pattern as well, so missing names end up as NaN again.
archive_clean['name'] = archive_clean['name'].mask(archive_clean['name'].str.match(r'^[0-9a-z\s]+$'))


archive_clean['name'].value_counts().head()

Charlie    11
Lucy       11
Oliver     10
Cooper     10
Penny       9
Name: name, dtype: int64


archive_clean.tweet_id = archive_clean.tweet_id.astype(str)


archive_clean.tweet_id.sample(5)

404     824025158776213504
196     854732716440526848
1955    673636718965334016
474     816091915477250048
1081    738885046782832640
Name: tweet_id, dtype: object


# Produce a mask that will only result in tweet_ids != to 18. 
# This should create an empty Series.
mask = (archive_clean.tweet_id.str.len() != 18)
test = archive_clean.tweet_id.loc[mask]
test.head()

Series([], Name: tweet_id, dtype: object)


archive_clean.head()


# Lets count the numbers in each column first
dog_stages = ['doggo', 'floofer', 'pupper', 'puppo']
for stage in dog_stages:
    display(archive_clean.groupby(stage).count())


# Collapse the four stage columns of a row into one value using sets.

# The stage labels that can appear in a row.
dog_stages = {'doggo', 'floofer', 'pupper', 'puppo'}


def calc_dog_stage(stages):
    """Combine the stage labels found in one row into a single value.

    Parameters
    ----------
    stages : iterable
        The values of one row. Anything that is not one of the labels in
        `dog_stages` (for example NaN) is ignored.

    Returns
    -------
    str or float
        The labels present, joined with '_' in alphabetical order (for
        example 'doggo_pupper'), or np.nan when no label is present.
    """
    found = set(stages).intersection(dog_stages)

    # Make sure it returns a NaN if there are no stages.
    if not found:
        return np.nan

    # Sets have no defined order, so sort before joining; otherwise the same
    # combination could come out as 'doggo_floofer' or 'floofer_doggo'.
    return '_'.join(sorted(found))


# Build the combined dog_stages column row by row, store it as a
# categorical, and remove the four columns it was built from.
stage_columns = ['doggo', 'floofer', 'pupper', 'puppo']
combined_stages = archive_clean[stage_columns].apply(calc_dog_stage, axis=1)
archive_clean['dog_stages'] = combined_stages


archive_clean = archive_clean.drop(stage_columns, axis=1)
archive_clean['dog_stages'] = archive_clean['dog_stages'].astype('category')


archive_clean.head(2)


archive_clean.dog_stages.value_counts()

pupper           220
doggo             72
puppo             23
floofer            9
doggo_pupper       9
floofer_doggo      1
doggo_puppo        1
Name: dog_stages, dtype: int64


archive_clean.dog_stages.nunique()

7


archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2075 entries, 0 to 2355
Data columns (total 9 columns):
tweet_id         2075 non-null object
timestamp        2075 non-null datetime64[ns]
source           2075 non-null category
text             2075 non-null object
expanded_urls    2072 non-null object
name             1384 non-null object
rating           2075 non-null float64
gender           1399 non-null float64
dog_stages       335 non-null category
dtypes: category(2), datetime64[ns](1), float64(2), object(4)
memory usage: 134.3+ KB


# Give the columns of the first prediction descriptive names.
image_clean = image_clean.rename(columns={
    'img_num': 'nof_images',
    'p1': 'best_prediction',
    'p1_conf': 'confidence',
    'p1_dog': 'dog_or_not',
})


# Capitalize the predicted labels and store them as a categorical.
capitalized = image_clean['best_prediction'].str.capitalize()
image_clean['best_prediction'] = capitalized.astype('category')


# Remove the second and third predictions.
image_clean = image_clean.drop(
    ['p2', 'p2_conf', 'p2_dog', 'p3', 'p3_conf', 'p3_dog'], axis=1)


# Reorder columns
column_order = ['tweet_id', 'best_prediction', 'confidence',
                'dog_or_not', 'nof_images', 'jpg_url']
image_clean = image_clean[column_order]


image_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 6 columns):
tweet_id           2075 non-null int64
best_prediction    2075 non-null category
confidence         2075 non-null float64
dog_or_not         2075 non-null bool
nof_images         2075 non-null int64
jpg_url            2075 non-null object
dtypes: bool(1), category(1), float64(1), int64(2), object(1)
memory usage: 84.0+ KB


image_clean.head(2)


image_clean.best_prediction.value_counts().head(5)

Golden_retriever      150
Labrador_retriever    100
Pembroke               89
Chihuahua              83
Pug                    57
Name: best_prediction, dtype: int64


image_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 6 columns):
tweet_id           2075 non-null int64
best_prediction    2075 non-null category
confidence         2075 non-null float64
dog_or_not         2075 non-null bool
nof_images         2075 non-null int64
jpg_url            2075 non-null object
dtypes: bool(1), category(1), float64(1), int64(2), object(1)
memory usage: 84.0+ KB


image_clean.tweet_id = image_clean.tweet_id.astype(str)


image_clean.tweet_id.sample(5)

1153    732726085725589504
730     686618349602762752
877     698342080612007937
815     692752401762250755
1987    872620804844003328
Name: tweet_id, dtype: object


# Produce a mask that will only result in tweet_ids != to 18. 
# This should create an empty Series.
mask = (image_clean.tweet_id.str.len() != 18)
test = image_clean.tweet_id.loc[mask]
test.head()

Series([], Name: tweet_id, dtype: object)


counts_clean.tweet_id = counts_clean.tweet_id.astype(str)


counts_clean.tweet_id.sample(5)

1964    672997845381865473
1249    710269109699739648
2041    671486386088865792
200     852912242202992640
1483    692568918515392513
Name: tweet_id, dtype: object


# Produce a mask that will only result in tweet_ids != to 18. 
# This should create an empty Series.
mask = (counts_clean.tweet_id.str.len() != 18)
test = counts_clean.tweet_id.loc[mask]
test.head()

Series([], Name: tweet_id, dtype: object)


# Build the master table: join the counts on tweet_id (default join), then
# left-join the image predictions so tweets without a prediction are kept,
# and finally leave out the columns that are not carried into the master.
twitter_archive_master = (
    archive_clean
    .merge(counts_clean, on='tweet_id')
    .merge(image_clean, on='tweet_id', how='left')
    .drop(['source', 'text', 'expanded_urls'], axis=1))


twitter_archive_master.head(2)


twitter_archive_master.shape

(2075, 13)


# Count the tweet_ids that twitter_archive_master and image_clean share.
c = set(twitter_archive_master.tweet_id.tolist())
ic = set(image_clean.tweet_id.tolist())
u = c & ic
len(u)

1949


# Write each table to its own UTF-8 .csv file, without the index column.
csv_targets = [
    (twitter_archive_master, 'twitter_archive_master.csv'),
    (archive_clean, 'archive_clean.csv'),
    (image_clean, 'image_clean.csv'),
    (counts_clean, 'counts_clean.csv'),
]
for table, csv_path in csv_targets:
    table.to_csv(csv_path, encoding='utf-8', index=False)


# For practise, also store in a database.
# Connect to the sqlite database
database = 'weratedogs.db'
conn = sqlite3.connect(database)


# Write each table to the sqlite database under its own name, replacing any
# table of that name that already exists and leaving out the index.
sql_targets = [
    ('twitter_archive_master', twitter_archive_master),
    ('archive_clean', archive_clean),
    ('image_clean', image_clean),
    ('counts_clean', counts_clean),
]
for table_name, table in sql_targets:
    table.to_sql(table_name, conn, if_exists='replace', index=False)


# Make sure they all read back in from sqlite db.
twitter_archive_master = pd.read_sql('SELECT * FROM twitter_archive_master', conn) 
archive_clean = pd.read_sql('SELECT * FROM archive_clean', conn) 
image_clean = pd.read_sql('SELECT * FROM image_clean', conn)
counts_clean = pd.read_sql('SELECT * FROM counts_clean', conn)


twitter_archive_master.head(1)


archive_clean.head(1)


image_clean.head(1)


# Check and make sure they read back fine from .csv.
# tweet_id is read as a string in every table. Without an explicit dtype,
# read_csv infers a numeric type for the all-digit ids, which undoes the
# string conversion made during cleaning and leaves the tables inconsistent.
twitter_archive_master = pd.read_csv(
    'twitter_archive_master.csv', encoding='utf-8',
    dtype={'tweet_id': str})
archive_clean = pd.read_csv(
    'archive_clean.csv', encoding='utf-8',
    dtype={'tweet_id': str})
image_clean = pd.read_csv(
    'image_clean.csv', encoding='utf-8',
    dtype={'tweet_id': str})
counts_clean = pd.read_csv(
    'counts_clean.csv', encoding='utf-8',
    dtype={'tweet_id': str})


twitter_archive_master.head(1)


# Make sure tweet_id is a string after the round trip through the .csv file.
twitter_archive_master.tweet_id = twitter_archive_master.tweet_id.astype(str)


archive_clean.head(1)


image_clean.head(1)


counts_clean.head(1)


counts_clean.head(1)


twitter_archive_master.describe()


twitter_archive_master.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 13 columns):
tweet_id           2075 non-null object
timestamp          2075 non-null object
name               1384 non-null object
rating             2075 non-null float64
gender             1399 non-null float64
dog_stages         335 non-null object
favorites          2075 non-null int64
retweets           2075 non-null int64
best_prediction    1949 non-null object
confidence         1949 non-null float64
dog_or_not         1949 non-null object
nof_images         1949 non-null float64
jpg_url            1949 non-null object
dtypes: float64(4), int64(2), object(7)
memory usage: 210.8+ KB


twitter_archive_master.head()


# Histogram of the ratings, with bin edges at every tenth from 0 to 1.4.
bins = [tenth / 10 for tenth in range(15)]
plt.figure(figsize=(8, 5))
plt.hist(x=twitter_archive_master.rating,
         bins=bins, edgecolor='white', linewidth=8)
plt.xlabel('Ratings')
plt.ylabel('Count')
plt.title('Histogram of Dog Ratings');


twitter_archive_master.gender.value_counts()

0.0    1053
1.0     346
Name: gender, dtype: int64


mfg = twitter_archive_master.groupby('gender').count()
mfg.tweet_id.head()

gender
0.0    1053
1.0     346
Name: tweet_id, dtype: int64


twitter_archive_master.gender.mean()

0.24731951393852752


twitter_archive_master.isnull().mean()

tweet_id           0.000000
timestamp          0.000000
name               0.333012
rating             0.000000
gender             0.325783
dog_stages         0.838554
favorites          0.000000
retweets           0.000000
best_prediction    0.060723
confidence         0.060723
dog_or_not         0.060723
nof_images         0.060723
jpg_url            0.060723
dtype: float64


# Histograms of favorites and retweets
plt.figure(figsize=(8, 5))
plt.xlim(0, 60000)
plt.ylabel('Count')
plt.hist(twitter_archive_master.favorites, alpha=.5, label='# Favorites')
plt.hist(twitter_archive_master.retweets, alpha=.5, label='# Retweets')
plt.title('Distribution of Favorites and Retweets Counts')
plt.legend();


# Copy the four columns used in the log plots, then replace the three
# numeric ones with their base-10 logarithm; timestamp is left unchanged.
log10 = twitter_archive_master[
    ['favorites', 'retweets', 'confidence', 'timestamp']].copy()
for log_column in ['favorites', 'retweets', 'confidence']:
    log10[log_column] = np.log10(twitter_archive_master[log_column])


# Log histogram favorites and retweets
plt.figure(figsize=(12, 8))
plt.ylabel('Count')
plt.hist(log10.favorites, alpha=.5, label='# Favorites')
plt.hist(log10.retweets, alpha=.5, label='# Retweets')
plt.title('Log10 Distribution of Favorites and Retweets Counts')
plt.legend();


# Histogram of Confidence Factor for Pictures
# Had to get rid of NaNs. Would not work unless I did.
sns.set_style('whitegrid')
plt.figure(figsize=(8, 5))
plt.xlim(.1, 1)
plt.xlabel('Confidence')
plt.ylabel('Count')
plt.hist(x = twitter_archive_master.confidence[
    ~np.isnan(twitter_archive_master.confidence)], 
    edgecolor='white', linewidth=1.2)
plt.title('Histogram of Confidence Factor For Pictures');


# Lets try a log
# Histogram of Confidence Factor for Pictures
# Had to get rid of NaNs. Would not work unless I did.
sns.set_style('whitegrid')
plt.figure(figsize=(8, 5))
plt.xlabel('Confidence')
plt.ylabel('Count')
plt.hist(x = log10.confidence[~np.isnan(log10.confidence)], 
        edgecolor='white', linewidth=1.2)
plt.title('Log10 Histogram of Confidence Factor For Pictures');


twitter_archive_master.boxplot(column='rating', 
   by = 'dog_stages', figsize=(8, 5));


twitter_archive_master.groupby('dog_stages')['rating'].describe()


# Compute the correlation matrix
corr = twitter_archive_master[['rating', 'gender', 
    'favorites', 'retweets', 'confidence', 'nof_images']].corr()
corr


# This is largely taken from
# https://seaborn.pydata.org/examples/many_pairwise_correlations.html
# There are a couple of minor tweaks.

# Generate a mask for the upper triangle (diagonal included), so each pair of
# variables is drawn only once.
# The builtin bool is used as the dtype: the np.bool alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(12, 8))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.set_style('darkgrid')
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .8});


# Favorites against retweets as a point cloud, clipped to the dense region.
point_style = dict(linestyle='', marker='o', markersize=0.7)
plt.figure(figsize=(12, 8))
plt.xlim(0, 40000)
plt.ylim(0, 8000)
plt.xlabel('Favorites Count')
plt.ylabel('Retweets Count')
plt.plot(twitter_archive_master.favorites,
         twitter_archive_master.retweets,
         **point_style)
plt.title('Scatter Plot of Favorites and Retweets');


# Log10 scatter plot of favorites and retweets.
# Both axes show log10-transformed counts (values come from the `log10`
# frame), so the axis labels say so instead of claiming raw counts.
plt.figure(figsize=(12, 8))
plt.xlabel('Log10 Favorites Count')
plt.ylabel('Log10 Retweets Count')
plt.plot(log10.favorites, log10.retweets,
         linestyle='', marker='o', markersize=0.7)
plt.title('Scatter Plot of Log10 Favorites and Retweets');


# Correlation between the two log10-transformed counts.
log10.favorites.corr(log10.retweets)


# Favorites and retweets are a proxy for one another, so only retweets are
# plotted over time.
# `.copy()` makes this an independent frame; assigning the parsed timestamps
# into a bare column slice of `log10` would trigger SettingWithCopyWarning.
time_log10_retweets = log10[['retweets', 'timestamp']].copy()
time_log10_retweets['timestamp'] = pd.to_datetime(
    time_log10_retweets['timestamp'], format='%Y-%m-%d %H:%M:%S.%f')
# Index by time so DataFrame.plot puts the timestamp on the x axis.
time_log10_retweets.set_index(['timestamp'], inplace=True)
sns.set_style('whitegrid')
time_log10_retweets.plot(figsize=(12, 8), title = 'Time vs Log10 Retweets',
           linestyle = '', marker = 'o', markersize = 0.7);


# Parse the timestamp column so it can drive date-based grouping.
twitter_archive_master['timestamp'] = pd.to_datetime(
    twitter_archive_master['timestamp'], format='%Y-%m-%d %H:%M:%S.%f')
# Number of tweets per calendar month.
by_month = pd.Grouper(key = 'timestamp', freq = "M")
cg_month_tweets = (twitter_archive_master
                   .groupby(by_month)['tweet_id']
                   .count()
                   .reset_index())
cg_month_tweets.head()


# Sanity check: the monthly counts should add up to the total row count.
cg_month_tweets.sum()


# Monthly tweet volume as a line chart over a fixed date window.
sns.set_style('whitegrid')
plt.figure(figsize=(12, 8))
window_start = datetime.date(2015, 11, 30)
window_end = datetime.date(2017, 7, 30)
plt.xlim([window_start, window_end])
plt.xlabel('Year and Month')
plt.ylabel('Tweets Count')
plt.plot(cg_month_tweets.timestamp, cg_month_tweets.tweet_id)
plt.title('Line Plot of Time vs Tweets');


# Total favorites per calendar day.
# Only 'favorites' is aggregated: summing every column and discarding the
# rest is wasted work and depends on how pandas treats non-numeric columns.
cg_day_favorites = (
    twitter_archive_master
    .groupby(pd.Grouper(key = 'timestamp', freq = "D"))['favorites']
    .sum()
    .reset_index()
)
cg_day_favorites.head()


 # Plot Time vs Favorites Since Inception By Day Count
sns.set_style('darkgrid')
plt.ylim(0, 125000)
plt.xlabel('Days Since First Tweet')
plt.ylabel('Favorites per Day')
plt.title('Time vs Favorites', fontsize = 14)
cg_day_favorites.favorites.plot(figsize=(12, 8),
           linestyle = '', marker = 'o', markersize = 1)

# Fit a straight line through favorites per day; the row index serves as the
# day number. numpy.polynomial's polyfit returns coefficients from lowest to
# highest degree, i.e. (intercept, slope).
day_index = cg_day_favorites.index.values
intercept, multiplier = polyfit(day_index, cg_day_favorites.favorites, 1)
# Build one string for the annotation; comma-separated pieces would create a
# tuple and plt.text would render its repr.
label = "favorites/day = {} + days * {}".format(
    round(intercept, 2), round(multiplier, 2))
plt.text(440, 45000, label)
plt.plot(day_index, intercept + multiplier * day_index, '-');


print(intercept, multiplier)


# How many tweets fall into each dog stage.
dog_stages_counts = twitter_archive_master.dog_stages.value_counts()
dog_stages_counts


# Legend entries of the form "<stage> <share>%", share rounded to 2 decimals.
denominator = dog_stages_counts.sum()
labels = [
    stage + ' ' + str(round((count / denominator) * 100, 2)) + '%'
    for stage, count in zip(dog_stages_counts.index.values,
                            dog_stages_counts)
]


# Pie chart of dog-stage shares; every slice is pulled out by the same offset.
plt.figure(figsize=(12, 8))
slice_offsets = (.05,) * len(labels)
plt.pie(dog_stages_counts, explode=slice_offsets,
        shadow=True, startangle=90)
plt.legend(labels, prop={'size': 14})
plt.title('Share of Dog Stages');


# Average rating per dog stage.
# The 'rating' column is selected before aggregating: a frame-wide mean()
# raises on non-numeric columns in pandas >= 2.0 and computes values that
# were thrown away anyway.
rdsg = (twitter_archive_master
        .groupby('dog_stages')['rating']
        .mean()
        .reset_index())
rdsg.head()


 # Plot dog_stages vs rating(s)
sns.set_style('darkgrid')
plt.figure(figsize=(8, 5))

# One bar per dog stage, positioned at 0..n-1 and half a unit wide.
bar_positions = np.arange(len(rdsg))
bar_width = .5
stage_bars = plt.bar(bar_positions, rdsg.rating, bar_width)

plt.title('Effect of Dog Stages on Rating(s)')
plt.xlabel('Dog Stages')
plt.ylabel('Average Rating(s)')
# Label each bar with its stage name; y ticks every 0.2.
plt.xticks(bar_positions, rdsg.dog_stages)
plt.yticks(np.arange(0, 1.4, .2));


# Twenty predicted breeds with the largest total number of favorites.
# NOTE(review): `== True` is kept so the mask matches the original filter
# exactly; confirm the dtype of `dog_or_not` before simplifying it.
fav_breed = twitter_archive_master[
    twitter_archive_master.dog_or_not == True]
# Only 'favorites' is summed; a frame-wide sum() would aggregate every
# column (string columns included) just to discard the results.
fav_breed = (fav_breed
             .groupby('best_prediction')['favorites']
             .sum()
             .reset_index())
fav_breed = fav_breed.sort_values(
    'favorites', ascending = False)[0:20]
fav_breed


# Horizontal bars: total favorites for each of the selected breeds.
sns.set_style('whitegrid')
plt.figure(figsize=(12, 8))
breed_positions = range(len(fav_breed.best_prediction))
plt.barh(breed_positions, fav_breed.favorites)
plt.yticks(breed_positions, fav_breed.best_prediction)
plt.xlabel('Sum of Favorites')
plt.ylabel('Dog Breed')
plt.title('Bar Chart of The Breeds That Had The Largest Total Number of Favorites');


# Ten most frequent values of the name column.
twitter_archive_master.name.value_counts().head(10)


# Twenty predicted breeds with the highest average rating.
# NOTE(review): `== True` is kept so the mask matches the original filter
# exactly; confirm the dtype of `dog_or_not` before simplifying it.
top_rated_breeds = twitter_archive_master[
    twitter_archive_master.dog_or_not == True]
# Only 'rating' is averaged: a frame-wide mean() raises on non-numeric
# columns in pandas >= 2.0 and its other results were discarded anyway.
top_rated_breeds = (top_rated_breeds
                    .groupby('best_prediction')['rating']
                    .mean()
                    .reset_index())
# log10 of the mean rating, shifted down by .05 (vectorized, no per-row
# lambda). The shift only moves the baseline of the bar chart.
top_rated_breeds['log10_rating'] = np.log10(top_rated_breeds.rating) - .05
top_rated_breeds = top_rated_breeds.sort_values(
    'log10_rating', ascending = False)[0:20]
top_rated_breeds


# Bar graph of the top rated breeds.
sns.set_style('whitegrid')
plt.figure(figsize = (12, 8))
# NOTE(review): a bare `xlim(.05, .1)` call used to sit here. It is not
# pyplot's function (presumably it resolved to the name star-imported from
# ggplot), so it never changed this figure's axis and has been removed.
# Use `plt.xlim(...)` if clipping the x axis is actually wanted.
breed_positions = range(len(top_rated_breeds.best_prediction))
plt.barh(breed_positions, top_rated_breeds.log10_rating)
plt.yticks(breed_positions, top_rated_breeds.best_prediction)
# The plotted quantity is log10(rating) - .05, not "Log10 * rating".
plt.xlabel('Log10(rating) - .05')
plt.ylabel('Dog Breed')
plt.title('Relative Breed Rating Bar Chart' );


# Breed names from both top-20 tables as plain lists...
fav_breed_list = list(fav_breed.best_prediction)
top_rated_list = list(top_rated_breeds.best_prediction)


# ...and as sets, for set arithmetic.
fav_breed_set = set(fav_breed_list)
top_rated_set = set(top_rated_list)


# Breeds that appear in both tables.
common = fav_breed_set.intersection(top_rated_set)
common


# Venn diagram of the overlap between most-favorited and top-rated breeds.
plt.figure(figsize = (12, 8))
# venn2 expects the sizes of the *exclusive* regions: '10' = only in the
# first set, '01' = only in the second, '11' = in both. Passing the full
# list lengths would count the shared breeds twice.
only_favorite = set(fav_breed_list) - common
only_top_rated = set(top_rated_list) - common
v = venn2(subsets = {'10': len(only_favorite),
                     '01': len(only_top_rated),
                     '11': len(common)},
          set_labels = ('Favorite Breeds', 'Top Rated Breeds'))
# The label of the overlap region is None when the intersection is empty,
# so the annotation listing the shared breeds is only drawn when it exists.
overlap_label = v.get_label_by_id('11')
if overlap_label is not None:
    plt.annotate(str(common), xy = overlap_label.get_position(),
        xytext = (0,70), ha = 'center', textcoords = 'offset points',
        bbox = dict(boxstyle = 'round, pad = 0.5', fc = 'lime', alpha = 0.3),
        arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3, rad = 0',
        color = 'gray'))
plt.title('Venn Diagram of Intersection Between Favorite and Top Rated Breeds');


# Total retweets per calendar month.
# Only 'retweets' is aggregated: summing every column and discarding the
# rest is wasted work and depends on how pandas treats non-numeric columns.
cg_month_retweets = (
    twitter_archive_master
    .groupby(pd.Grouper(key = 'timestamp', freq = "M"))['retweets']
    .sum()
    .reset_index()
)
cg_month_retweets.head()


# Monthly retweets are divided by 1000 so they fit on the same axis as the
# monthly tweet counts; the chart is about relative direction, not size.
sns.set_style('darkgrid')
plt.figure(figsize=(12, 8))
window = [datetime.date(2015, 11, 30), datetime.date(2017, 7, 30)]
plt.xlim(window)
plt.xlabel('Year and Month')
plt.ylabel('Tweets and (Retweets/1000)')
scaled_retweets = cg_month_retweets.retweets/1000
plt.plot(cg_month_retweets.timestamp, scaled_retweets)
plt.plot(cg_month_tweets.timestamp, cg_month_tweets.tweet_id)
# Legend entries follow the order in which the two lines were drawn.
plt.legend(('Retweets', 'Tweets'));
plt.title('Line Plot of Time vs Tweets vs (Retweets/1000)');


# Daily averages of rating and favorites.
# The earlier `twitter_archive_master = twitter_archive_master.copy()` line
# only rebound the frame to a copy of itself and has been removed; the
# groupby below already returns a new frame.
trf = (
    twitter_archive_master[['timestamp', 'rating', 'favorites']]
    .groupby(pd.Grouper(key = 'timestamp', freq = "D"))
    .mean()
    .reset_index()
)
trf.head()


# Spread of the daily average rating.
trf.boxplot(column='rating');


# Summary statistics of the daily average rating; its quartiles are used as
# class boundaries further down.
rating_breaks = trf.rating.describe()
rating_breaks


# Upper and lower quartile, looked up by label. describe() returns a
# label-indexed Series, and integer position lookups such as
# rating_breaks[6] are deprecated in recent pandas.
rating_breaks['75%'], rating_breaks['25%']


# Classify each day by its average rating, from highest to lowest:
#   1 = above the upper quartile, 2 = above the lower quartile, 3 = the rest.
# Days whose rating is NaN fail both comparisons and therefore land in 3.
# Quartiles are read by label; integer position lookups on the describe()
# Series are deprecated in recent pandas.
upper_break = rating_breaks['75%']
lower_break = rating_breaks['25%']
trf['rating_class'] = trf.rating.apply(
    lambda x: 1 if x > upper_break else (2 if x > lower_break else 3))
# 1-based day counter derived from the row index.
trf['days'] = trf.index.values + 1
trf.head(2)


# Daily average favorites over time, colored by rating class.
# Parentheses replace the backslash line continuations.
(
    ggplot(trf, aes(x='days', y='favorites', color='rating_class'))
    + scale_y_continuous(limits=(0, 60000))
    + geom_point(alpha=.7)
    + scale_color_gradient(low='red', high='blue')
    + labs(title="Change in Rating's Class Over Time and Favorites",
           x="Days From Start of @dog_rates", y="Favorites")
)


# Look at a single day in the daily table.
on_that_day = trf.timestamp == '2016-09-12'
trf[on_that_day]


# For every individual rating we need the average of retweets and favorites.


# Distinct rating values present in the master table.
rating_values = twitter_archive_master.rating.unique()
rating_values


# Mean favorites (rf_dict) and mean retweets (rr_dict), keyed by rating.
rf_dict = {}
rr_dict = {}


# The rows for a rating are filtered once and used for both averages. The
# previous version built the same subset twice and never read the second one.
for score in rating_values:
    same_rating = twitter_archive_master[
        twitter_archive_master.rating == score]
    rf_dict[score] = same_rating.favorites.mean()
    rr_dict[score] = same_rating.retweets.mean()


# Combine the per-rating averages into one table, sorted by rating.
# Retweet means are looked up by rating key rather than pasted in by
# position, so the columns stay aligned even if the two dicts were filled
# in different orders.
rfr = pd.DataFrame({
    'ratings': list(rf_dict),
    'fav_mean': list(rf_dict.values()),
    'retweets_mean': [rr_dict[score] for score in rf_dict],
})
rfr.sort_values('ratings', inplace = True)

rfr


# Average retweets and favorites as lines over the rating value.
sns.set_style('darkgrid')
plt.figure(figsize=(12, 8))
plt.xlim(0, 1.4)
# Retweets are drawn first, favorites second; the legend uses that order.
for mean_column in ('retweets_mean', 'fav_mean'):
    plt.plot(rfr.ratings, rfr[mean_column])
plt.title('Retweets and Favorites over Rating(s)')
plt.xlabel('Rating(s) From 0/14 to 14/14')
plt.ylabel('Counts')
plt.legend(('Average of Retweets', 'Average of Favorites'));


# Stacked bars per rating: average retweets at the bottom, average
# favorites stacked on top.
sns.set_style('darkgrid')
plt.figure(figsize=(12, 8))

ind = np.arange(len(rfr))    # the locations for the groups on the x axis
width = .5                   # the width of the bars

p1 = plt.bar(ind, rfr.retweets_mean, width)
p2 = plt.bar(ind, rfr.fav_mean, width,
             bottom = rfr.retweets_mean)

plt.xlabel('Rating(s) From 0/14 to 14/14')
plt.ylabel('Counts')
plt.title('Retweets and Favorites over Rating(s)')
plt.xticks(ind, rfr.ratings)
plt.yticks(np.arange(0, 40000, 5000))
# p1 holds the retweet bars and p2 the favorite bars, so the legend text
# must follow that order (the two labels were previously swapped).
plt.legend((p1[0], p2[0]), ('Average of Retweets', 'Average of Favorites'));


# Show the row(s) holding the highest retweet count.
max_retweets = twitter_archive_master.retweets.max()
is_most_retweeted = twitter_archive_master.retweets == max_retweets
twitter_archive_master[is_most_retweeted]

	tweet_id	jpg_url	img_num	p1	p1_conf	p1_dog	p2	p2_conf	p2_dog	p3	p3_conf	p3_dog
0	666020888022790149	https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg	1	Welsh_springer_spaniel	0.465074	True	collie	0.156665	True	Shetland_sheepdog	0.061428	True
1	666029285002620928	https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg	1	redbone	0.506826	True	miniature_pinscher	0.074192	True	Rhodesian_ridgeback	0.072010	True
2	666033412701032449	https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg	1	German_shepherd	0.596461	True	malinois	0.138584	True	bloodhound	0.116197	True
3	666044226329800704	https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg	1	Rhodesian_ridgeback	0.408143	True	redbone	0.360687	True	miniature_pinscher	0.222752	True
4	666049248165822465	https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg	1	miniature_pinscher	0.560311	True	Rottweiler	0.243682	True	Doberman	0.154629	True

	tweet_id	favorites	retweets
0	892420643555336193	38690	8558
1	892177421306343426	33164	6292
2	891815181378084864	24963	4174
3	891689557279858688	42070	8681
4	891327558926688256	40226	9451

	tweet_id	text	rating_numerator	rating_denominator
1228	713900603437621249	Happy Saturday here's 9 puppers on a bench. 99/90 good work everybody https://t.co/mpvaVxKmc1	99	90
1662	682962037429899265	This is Darrel. He just robbed a 7/11 and is in a high speed police chase. Was just spotted by the helicopter 10/10 https://t.co/7EsP8LmSp5	7	11
1433	697463031882764288	Happy Wednesday here's a bucket of pups. 44/40 would pet all at once https://t.co/HppvrYuamZ	44	40
1120	731156023742988288	Say hello to this unbelievably well behaved squad of doggos. 204/170 would try to pet all at once https://t.co/yGQI3He3xv	204	170
433	820690176645140481	The floofs have been released I repeat the floofs have been released. 84/70 https://t.co/NIYC820tmd	84	70

	tweet_id	jpg_url	img_num	p1	p1_conf	p1_dog	p2	p2_conf	p2_dog	p3	p3_conf	p3_dog
0	666020888022790149	https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg	1	Welsh_springer_spaniel	0.465074	True	collie	0.156665	True	Shetland_sheepdog	0.061428	True
1	666029285002620928	https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg	1	redbone	0.506826	True	miniature_pinscher	0.074192	True	Rhodesian_ridgeback	0.072010	True
2	666033412701032449	https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg	1	German_shepherd	0.596461	True	malinois	0.138584	True	bloodhound	0.116197	True
3	666044226329800704	https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg	1	Rhodesian_ridgeback	0.408143	True	redbone	0.360687	True	miniature_pinscher	0.222752	True
4	666049248165822465	https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg	1	miniature_pinscher	0.560311	True	Rottweiler	0.243682	True	Doberman	0.154629	True

	tweet_id	jpg_url	img_num	p1	p1_conf	p1_dog	p2	p2_conf	p2_dog	p3	p3_conf	p3_dog
1964	867072653475098625	https://pbs.twimg.com/media/DAElHfmUMAEH9lB.jpg	1	Blenheim_spaniel	0.352946	True	papillon	0.211766	True	Pekinese	0.112952	True
378	673240798075449344	https://pbs.twimg.com/media/CVfU7KLXAAAAgIa.jpg	1	Airedale	0.443004	True	brown_bear	0.114162	False	Chesapeake_Bay_retriever	0.094639	True
727	686377065986265092	https://pbs.twimg.com/media/CYaAS2kUoAINkye.jpg	1	German_shepherd	0.830816	True	Leonberg	0.076325	True	bloodhound	0.037449	True
1019	710140971284037632	https://pbs.twimg.com/media/Cdrtcr-W4AAqi5H.jpg	1	Pekinese	0.953170	True	papillon	0.019517	True	Japanese_spaniel	0.005821	True
1000	708711088997666817	https://pbs.twimg.com/media/CdXY-GHWoAALing.jpg	2	tennis_ball	0.912961	False	German_short-haired_pointer	0.052695	True	Labrador_retriever	0.018477	True

	tweet_id	in_reply_to_status_id	in_reply_to_user_id	timestamp	source	text	retweeted_status_id	retweeted_status_user_id	retweeted_status_timestamp	expanded_urls	rating_numerator	rating_denominator	name	doggo	floofer	pupper	puppo
0	892420643555336193	NaN	NaN	2017-08-01 16:23:56 +0000	<a href="http://twitter.com/download/iphone" r...	This is Phineas. He's a mystical boy. Only eve...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/892420643...	13	10	Phineas	None	None	None	None
1	892177421306343426	NaN	NaN	2017-08-01 00:17:27 +0000	<a href="http://twitter.com/download/iphone" r...	This is Tilly. She's just checking pup on you....	NaN	NaN	NaN	https://twitter.com/dog_rates/status/892177421...	13	10	Tilly	None	None	None	None

	tweet_id	in_reply_to_status_id	in_reply_to_user_id	timestamp	source	text	retweeted_status_id	retweeted_status_user_id	retweeted_status_timestamp	expanded_urls	rating_numerator	rating_denominator	name	doggo	floofer	pupper	puppo
1945	673707060090052608	NaN	NaN	2015-12-07 03:34:14 +0000	<a href="http://twitter.com/download/iphone" r...	This is Raymond. He's absolutely terrified of ...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/673707060...	10	10	Raymond	None	None	pupper	None
399	824796380199809024	NaN	NaN	2017-01-27 01:49:15 +0000	<a href="http://twitter.com/download/iphone" r...	RT @dog_rates: This is Bailey. She loves going...	7.950767e+17	4.196984e+09	2016-11-06 01:33:58 +0000	https://twitter.com/dog_rates/status/795076730...	11	10	Bailey	None	None	None	None
1592	686394059078897668	NaN	NaN	2016-01-11 03:47:50 +0000	<a href="http://vine.co" rel="nofollow">Vine -...	This pup's having a nightmare that he forgot t...	NaN	NaN	NaN	https://vine.co/v/iMqBebnOvav	12	10	None	None	None	None	None
472	816450570814898180	NaN	NaN	2017-01-04 01:05:59 +0000	<a href="http://twitter.com/download/iphone" r...	Meet Moose. He doesn't want his friend to go b...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/816450570...	13	10	Moose	None	None	None	None
1114	732732193018155009	NaN	NaN	2016-05-18 00:39:02 +0000	<a href="http://twitter.com/download/iphone" r...	This is Ralpher. He's an East Guinean Flop Dog...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/732732193...	12	10	Ralpher	None	None	None	None

	tweet_id	text	rating_numerator	rating_denominator
979	749981277374128128	This is Atticus. He's quite simply America af. 1776/10 https://t.co/GRXwMxLBkh	1776	10
313	835246439529840640	@jonnysun @Lin_Manuel ok jomny I know you're excited but 960/00 isn't a valid rating, 13/10 is tho	960	0
189	855860136149123072	@s8n You tried very hard to portray this good boy as not so good, but you have ultimately failed. His goodness shines through. 666/10	666	10
2074	670842764863651840	After so many requests... here you go.\n\nGood dogg. 420/10 https://t.co/yfAAo1gdeY	420	10
188	855862651834028034	@dhmontgomery We also gave snoop dogg a 420/10 but I think that predated your research	420	10

	tweet_id	jpg_url	img_num	p1	p1_conf	p1_dog	p2	p2_conf	p2_dog	p3	p3_conf	p3_dog
480	675354435921575936	https://pbs.twimg.com/ext_tw_video_thumb/675354114423808004/pu/img/qL1R_nGLqa6lmkOx.jpg	1	upright	0.303415	False	golden_retriever	0.181351	True	Brittany_spaniel	0.162084	True
1297	752309394570878976	https://pbs.twimg.com/ext_tw_video_thumb/675354114423808004/pu/img/qL1R_nGLqa6lmkOx.jpg	1	upright	0.303415	False	golden_retriever	0.181351	True	Brittany_spaniel	0.162084	True
1864	842892208864923648	https://pbs.twimg.com/ext_tw_video_thumb/807106774843039744/pu/img/8XZg1xW35Xp2J6JW.jpg	1	Chihuahua	0.505370	True	Pomeranian	0.120358	True	toy_terrier	0.077008	True
1641	807106840509214720	https://pbs.twimg.com/ext_tw_video_thumb/807106774843039744/pu/img/8XZg1xW35Xp2J6JW.jpg	1	Chihuahua	0.505370	True	Pomeranian	0.120358	True	toy_terrier	0.077008	True
1703	817181837579653120	https://pbs.twimg.com/ext_tw_video_thumb/815965888126062592/pu/img/JleSw4wRhgKDWQj5.jpg	1	Tibetan_mastiff	0.506312	True	Tibetan_terrier	0.295690	True	otterhound	0.036251	True

	tweet_id	timestamp	source	text	expanded_urls	name	doggo	floofer	pupper	puppo	rating
0	892420643555336193	2017-08-01 16:23:56	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU	https://twitter.com/dog_rates/status/892420643555336193/photo/1	Phineas	None	None	None	None	1.3
1	892177421306343426	2017-08-01 00:17:27	<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>	This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https://t.co/0Xxu71qeIV	https://twitter.com/dog_rates/status/892177421306343426/photo/1	Tilly	None	None	None	None	1.3

	tweet_id	timestamp	source	text	expanded_urls	name	doggo	floofer	pupper	puppo	rating	gender
0	892420643555336193	2017-08-01 16:23:56	iPhone	This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU	https://twitter.com/dog_rates/status/892420643555336193/photo/1	Phineas	NaN	NaN	NaN	NaN	1.3	0.0
1	892177421306343426	2017-08-01 00:17:27	iPhone	This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https://t.co/0Xxu71qeIV	https://twitter.com/dog_rates/status/892177421306343426/photo/1	Tilly	NaN	NaN	NaN	NaN	1.3	1.0
2	891815181378084864	2017-07-31 00:18:03	iPhone	This is Archie. He is a rare Norwegian Pouncing Corgo. Lives in the tall grass. You never know when one may strike. 12/10 https://t.co/wUnZnhtVJB	https://twitter.com/dog_rates/status/891815181378084864/photo/1	Archie	NaN	NaN	NaN	NaN	1.2	0.0
3	891689557279858688	2017-07-30 15:58:51	iPhone	This is Darla. She commenced a snooze mid meal. 13/10 happens to the best of us https://t.co/tD36da7qLQ	https://twitter.com/dog_rates/status/891689557279858688/photo/1	Darla	NaN	NaN	NaN	NaN	1.3	1.0
4	891327558926688256	2017-07-29 16:00:24	iPhone	This is Franklin. He would like you to stop calling him "cute." He is a very fierce shark and should be respected as such. 12/10 #BarkWeek https://t.co/AtUZn91f7f	https://twitter.com/dog_rates/status/891327558926688256/photo/1,https://twitter.com/dog_rates/status/891327558926688256/photo/1	Franklin	NaN	NaN	NaN	NaN	1.2	0.0

	rating	gender	favorites	retweets	confidence	nof_images
count	2075.000000	1399.000000	2075.000000	2075.000000	1949.000000	1949.000000
mean	1.060627	0.247320	8906.798072	2774.800000	0.594315	1.202668
std	0.215384	0.431608	12519.998157	4708.667753	0.272293	0.560024
min	0.000000	0.000000	80.000000	13.000000	0.044333	1.000000
25%	1.000000	0.000000	1987.000000	617.500000	0.362596	1.000000
50%	1.100000	0.000000	4091.000000	1351.000000	0.587764	1.000000
75%	1.200000	0.000000	11203.500000	3185.000000	0.846986	1.000000
max	1.400000	1.000000	142993.000000	77130.000000	1.000000	4.000000

Data Wrangling Twitter¶

Table of Contents¶

Contact Information¶

Project Description¶

Gather¶

twitter_archive¶

image_predictions¶

twitter_counts¶

Assess¶

General¶

twitter_archive¶

Atticus¶

image_predictions¶

twitter_counts¶

Quality¶

twitter_archive¶

image_predictions¶

twitter_counts¶

flagged but not cleaned.¶

Tidiness:¶

Cleaning¶

archive_clean¶

Define¶

Code¶

Test¶

Define¶

Code¶

Test¶

Define¶

Code¶

Test¶

Define¶

Code¶

Test¶

Define¶

Code¶

Test¶

Define¶

Code¶

Test¶

Define¶

Code¶

Test¶

Define¶

Code¶

Test¶

Define¶

Code¶

Test¶

image_clean¶

Define¶

Code¶

Test¶

Define¶

Code¶

Test¶

counts_clean¶

Define¶

Code¶

Test¶

Tidiness:¶

Define¶

Test¶

Store¶

Test¶

Analyzing and Visualizing Data¶

Data Distribution¶

Bivariate Analysis¶

Multivariate Analysis¶

Fun Analysis¶

Conclusions¶

Limitations¶

Summary¶