Lesson 4


Set working directory and read data:

setwd("/Users/ndvo/Dropbox/Udacity\ Nanodegree/P4_EDA/facebook/")

pf = read.delim('pseudo_facebook.tsv')
> head(pf)
   userid age dob_day dob_year dob_month gender tenure friend_count
1 2094382  14      19     1999        11   male    266            0
2 1192601  14       2     1999        11 female      6            0
3 2083884  14      16     1999        11   male     13            0
4 1203168  14      25     1999        12 female     93            0
5 1733186  14       4     1999        12   male     82            0
6 1524765  14       1     1999        12   male     15            0
  friendships_initiated likes likes_received mobile_likes mobile_likes_received
1                     0     0              0            0                     0
2                     0     0              0            0                     0
3                     0     0              0            0                     0
4                     0     0              0            0                     0
5                     0     0              0            0                     0
6                     0     0              0            0                     0
  www_likes www_likes_received
1         0                  0
2         0                  0
3         0                  0
4         0                  0
5         0                  0
6         0                  0
> str(pf)
'data.frame':   99003 obs. of  15 variables:
 $ userid               : int  2094382 1192601 2083884 1203168 1733186 1524765 1136133 1680361 1365174 1712567 ...
 $ age                  : int  14 14 14 14 14 14 13 13 13 13 ...
 $ dob_day              : int  19 2 16 25 4 1 14 4 1 2 ...
 $ dob_year             : int  1999 1999 1999 1999 1999 1999 2000 2000 2000 2000 ...
 $ dob_month            : int  11 11 11 12 12 12 1 1 1 2 ...
 $ gender               : Factor w/ 2 levels "female","male": 2 1 2 1 2 2 2 1 2 2 ...
 $ tenure               : int  266 6 13 93 82 15 12 0 81 171 ...
 $ friend_count         : int  0 0 0 0 0 0 0 0 0 0 ...
 $ friendships_initiated: int  0 0 0 0 0 0 0 0 0 0 ...
 $ likes                : int  0 0 0 0 0 0 0 0 0 0 ...
 $ likes_received       : int  0 0 0 0 0 0 0 0 0 0 ...
 $ mobile_likes         : int  0 0 0 0 0 0 0 0 0 0 ...
 $ mobile_likes_received: int  0 0 0 0 0 0 0 0 0 0 ...
 $ www_likes            : int  0 0 0 0 0 0 0 0 0 0 ...
 $ www_likes_received   : int  0 0 0 0 0 0 0 0 0 0 ...

Scatterplots and Perceived Audience Size

Notes:


Scatterplots

Notes:

ggplot(aes(x = age, y = friend_count), data = pf) + 
  geom_point()

image


What are some things that you notice right away?

Response:


ggplot Syntax

Notes:


Overplotting

Notes:

# to find out min and max of pf$age
# summary(pf$age)
# 13.00   20.00   28.00   37.28   50.00  113.00 

# also, setting alpha = 1/20 - it takes 20 points to build to 'full' colour point.
# geom_jitter (instead of geom_point) - at random noise to disperse the vertical columns 
ggplot(aes(x = age, y = friend_count), data = pf) +
  geom_jitter(alpha = 1/20) + 
  #scale_y_log10() + 
  xlim(13, 90)

image

What do you notice in the plot?

Response:

Coord_trans()

Notes:
Look up the documentation for coord_trans() and add a layer to the plot that transforms friend_count using the square root function. Create your plot!

# Dealing with jitter values < 0
#ggplot(aes(x = age, y = friend_count), data = pf[pf$friend_count>0,]) +
#  geom_jitter(alpha = 1/20) + 
#  coord_trans(ytrans = "sqrt") +
#  xlim(13, 90)

ggplot(aes(x = age, y = friend_count), data = pf) +
  geom_jitter(alpha = 1/20, position = position_jitter(h = 0)) + 
  coord_trans(ytrans = "sqrt") +
  xlim(13, 90)

image

What do you notice?

There is more younger users of facebook (density of points for age < 30 is higher than for other ages)


Alpha and Jitter

Examine the relationship between friendships_initiated (y) and age (x) using the ggplot syntax.

We recommend creating a basic scatter plot first to see what the distribution looks like and then adjusting it by adding one layer at a time.

Notes:

p <- ggplot(aes(x = age, y = friendships_initiated), data = pf)

p1 <- p + geom_point() + theme(text = element_text(face= "bold",size = 25))

p2 <- p + geom_jitter() + theme(text = element_text(face= "bold",size = 25))

p3 <- p + geom_jitter(alpha = 1/20, position = position_jitter(h = 0)) +
  scale_y_sqrt() +
  theme(text = element_text(face= "bold",size = 25)) +
  xlim(13, 90)
grid.arrange(p1, p2, p3, ncol = 1)

image


Overplotting and Domain Knowledge

Notes:


Conditional Means

More dplyr resources:

library(dplyr)

pf.fc_by_age <- pf %>%
                    select(age, friend_count) %>%
                    group_by(age) %>%
                    summarise(Mean_Friend_Count = mean(friend_count))
                    
                

image

Create your plot!

p_fc_by_age <- ggplot(aes(x = age, y = Mean_Friend_Count, fill = age), data = pf.fc_by_age)
p_fc_by_age + geom_bar(stat = 'identity') +
            scale_x_continuous(breaks= c(seq(15,120,5),13))

image

p_fc_by_age <- ggplot(aes(x = age, y = Mean_Friend_Count, fill = as.factor(age)), data = pf.fc_by_age)
p_fc_by_age + geom_bar(stat = 'identity') +
            scale_x_continuous(breaks= c(seq(15,120,5),13))

image

p_fc_by_age + geom_line() +
        scale_x_continuous(breaks= c(seq(15,120,5),13))

image


Overlaying Summaries with Raw Data

We take the jittered friendship graph as the base and add 4 summary overlays on top of it:

ggplot(aes(x = age, y = friend_count), data = pf) +
  #xlim(13, 90) +
  #coord_trans(ytrans = "sqrt") +
  coord_cartesian(xlim = c(13, 90), ylim=c(0, 1000)) +
  geom_point(alpha = 0.05, 
             position = position_jitter(h = 0),
             color = 'orange') + 
  geom_line(stat = 'summary', fun.y = mean, aes(color = 'Mean'), size = 1) +
  geom_line(stat = 'summary', fun.y = median, aes(color = 'Median'), size = 1) +
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .1), linetype = 2, aes(color = '10th percentile'), size = 1) +
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .9), linetype = 2, aes(color = '90th percentile'), size = 1) +
  scale_color_manual(values=c("black", "blue", "red", "dark green")) +
  theme(text = element_text(face= "bold",size = 25))

image

What are some of your observations of the plot?

Response:


Moira: Histogram Summary and Scatterplot

See the Instructor Notes of this video to download Moira’s paper on perceived audience size and to see the final plot.

Notes:


Correlation

Notes:

image

cor.test(pf$age, pf$friend_count, method="pearson")

Look up the documentation for the cor.test function.

What’s the correlation between age and friend count? Round to three decimal places.
Response:
cor = -0.027

Results:
-0.3 < cor < 0.3 - no correlation

0.3 < cor < 0.5 - small correlation

0.5 < cor < 0.7 - moderate correlation

0.7 < cor - large correlation


Correlation on Subsets

Notes:

with(pf[pf$age<=70,] , cor.test(age, friend_count))

Correlation Methods

Notes: correlation numbers are useful but do not give as much insight as plotting the data.


Create Scatterplots

Notes:

# There are few significant outliers that extend the axis. Let's remove them as 1-2% top outliers. Where to cut?
# quantile(pf$likes_received, probs = c(.90, .95, .98))

ggplot(aes(x = www_likes_received, y = likes_received), data=pf) +
  geom_point(alpha = 0.05) +
  coord_cartesian(xlim = c(1, 600), ylim = c(1, 600)) +
  scale_x_sqrt() +
  scale_y_sqrt() + 
  # add correlation line
  geom_smooth(method = 'lm', color = 'red')

What’s the correlation betwen the two variables? Include the top 5% of values for the variable in the calculation and round to 3 decimal places.

cor.test(pf$www_likes_received, pf$likes_received, method="pearson")

Response: 0.948


Moira on Correlation

Notes:


More Caution with Correlation

Notes:

#install.packages('alr3')
library(alr3)
data(Mitchell)

ggplot(aes(x = Month, y = Temp), data = Mitchell) +
  geom_point()

image


Noisy Scatterplots

a. Take a guess for the correlation coefficient for the scatterplot.

0.05

b. What is the actual correlation of the two variables?
(Round to the thousandths place)

cor.test(Mitchell$Month, Mitchell$Temp, method="pearson")

0.057


Making Sense of Data

Break up the x-axis in 12 months increments

ggplot(aes(x = Month, y = Temp), data = Mitchell) +
  geom_point() + 
  scale_x_continuous(breaks = seq(0, 203, 12))

image


A New Perspective

Zoom in into the plot, stretch y-axis and extend the x-axis.

What do you notice?

image

Response:

Every year, temperature follows the same pattern: starts with low values on January, goes up
and peaks around June, July. Then, the temperature symmetrically goes down to lowest values in December.
Function looks like first half of sinusoidal function.

Watch the solution video and check out the Instructor Notes!
Notes:

ggplot(aes(x=(Month%%12),y=Temp),data=Mitchell)+ 
  geom_point() 

image


Understanding Noise: Age to Age Months

Notes: concrete values of means are just sample of the data and still contains some noise inside them.
The effect is event more visible if we count meanfriendcount / agebymonth.

ggplot(aes(x=age, y=friend_count_mean), data=pf.fc_by_age) +
  geom_line()
head(pf.fc_by_age, 10)
pf.fc_by_age[17:19,]

Age with Months Means

# Use December 31 as the reference date.
pf$age_with_months <- pf$age + (1 - pf$dob_month / 12)

Programming Assignment

pf.fc_by_age_months <- pf %>%
                          group_by(age_with_months) %>%
                          summarise(friend_count_mean = mean(friend_count),
                                    friend_count_median = median(friend_count),
                                     n = n()) %>%
                          arrange(age_with_months)
  
head(pf.fc_by_age_months)


Noise in Conditional Means

pf.fc_by_age_months[pf.fc_by_age_months$age_with_months<71,] %>%
ggplot(data = ., aes(x = age_with_months, y = friend_count_mean)) +
    geom_line() +
    theme(text = element_text(face= "bold",size = 25))

image


Smoothing Conditional Means

p1 <- subset(pf.fc_by_age, age < 71) %>%
        ggplot(aes(x = age, y = friend_count_mean), data = .) +
          geom_line() +
          theme(text = element_text(face= "bold",size = 25))



p2 <- pf.fc_by_age_months[pf.fc_by_age_months$age_with_months<71,] %>%
        ggplot(data = ., aes(x = age_with_months, y = friend_count_mean)) +
            geom_line() +
            theme(text = element_text(face= "bold",size = 25))

library(gridExtra)
grid.arrange(p2, p1, ncol = 1)

image

Notes:

Local Regression:

p1 <- subset(pf.fc_by_age, age < 71) %>%
        ggplot(aes(x = age, y = friend_count_mean), data = .) +
          geom_line() +
          geom_smooth() +
          theme(text = element_text(face= "bold",size = 20))



p2 <- pf.fc_by_age_months[pf.fc_by_age_months$age_with_months<71,] %>%
        ggplot(data = ., aes(x = age_with_months, y = friend_count_mean)) +
            geom_line() +
            geom_smooth() +
            theme(text = element_text(face= "bold",size = 20))

p3 <- subset(pf, age<71) %>%
        ggplot(aes(x = round(age / 5) * 5, y = friend_count), data = .) +
            geom_line(stat = 'summary', fun.y = mean) +
            theme(text = element_text(face= "bold",size = 20))


library(gridExtra)
grid.arrange(p2, p1, p3, ncol = 1)

image


Which Plot to Choose?

Notes:


Analyzing Two Variables

Reflection:

Extra notes:


Problems set 4

Price vs. x

# In this problem set, you'll continue
# to explore the diamonds data set.

# Your first task is to create a
# scatterplot of price vs x.
# using the ggplot syntax.

library(ggplot2)
library(ggthemes)
theme_set(theme_minimal())
library(dplyr)
data(diamonds)

summary(diamonds$x)
diamonds %>%
    ggplot(aes(x = x, y = price), data = .) +
    geom_point(alpha = 0.05) +
    scale_x_continuous(limits = c(3.5,10),breaks = seq(3,10,0.5)) +
    theme(text = element_text(face= "bold",size = 20))

image

Observations:

Correlation

# correlation between price and x (0.88)
cor.test(diamonds$price, diamonds$x)

image

# correlation between price and y (0.87)
cor.test(diamonds$price, diamonds$y)

image

# correlation between price and z (0.86)
cor.test(diamonds$price, diamonds$z)

image

Price vs. depth

diamonds %>%
    ggplot(aes(x = depth, y = price), data = .) +
    geom_point(alpha = 0.1) +
    scale_x_continuous(limits = c(50,70),breaks = seq(50,70,1)) +
    theme(text = element_text(face= "bold",size = 20))

image

# Change the code to make the transparency of the
# points to be 1/100 of what they are now and mark
# the x-axis every 2 units.

diamonds %>%
    ggplot(aes(x = depth, y = price), data = .) +
    geom_point(alpha = 0.01) +
    scale_x_continuous(limits = c(50,70),breaks = seq(50,70,2)) +
    theme(text = element_text(face= "bold",size = 20))

image

Typical depth range: 58 to 64.

# What's the correlation of depth vs price? (-0.01)
cor.test(diamonds$depth, diamonds$price)

image

Would you use depth to predict price of a diamond: No
Why: low absolute value of correlation coefficient (next to 0) means there is no correlation between depth and price.
Diamonds of any depth can achieve both very low and very high price.

Price vs. carat

# Create a scatterplot of price vs carat
# and omit the top 1% of price and carat
# values.
diamonds.top1p_excluded <- diamonds %>% 
    subset(diamonds$carat < quantile(diamonds$carat, .99) & 
            diamonds$price < quantile(diamonds$price, .99))

diamonds.top1p_excluded %>%
    ggplot(aes(x = carat, y = price), data = .) +
      geom_point() +
      theme(text = element_text(face= "bold",size = 20))

image

Price vs. volume

# Create a scatterplot of price vs. volume (x * y * z).
# This is a very rough approximation for a diamond's volume.

diamonds %>%
    ggplot(aes(x = volume, y = price), data = .) +
      geom_point(alpha = 0.05) +
      scale_x_log10(limits=c(50, 1000), breaks=c(50, 100, 200, 300, 400, 500, 700, 1000)) +
      theme(text = element_text(face= "bold",size = 20))

image

Observations:

Did not notice:

# What's the correlation of price and volume? (0.92)
# NOTE: instructor notes suggest removing outliers (vol > 800 && vol = 0)
cor.test(diamonds$price, diamonds$volume)

Adjustments - price vs. volume

# Subset the data to exclude diamonds with a volume
# greater than or equal to 800. Also, exclude diamonds
# with a volume of 0. Adjust the transparency of the
# points and add a linear model to the plot. (See the
# Instructor Notes or look up the documentation of
# geom_smooth() for more details about smoothers.)
require(mgcv)
diamonds.vol_nooutliers = diamonds[diamonds$volume < 800 & diamonds$volume != 0, ]
ggplot(aes(x = volume, y = price), data = diamonds.vol_nooutliers) +
  geom_point(alpha = 0.05) + 
  stat_smooth(method = 'lm', formula = y ~ x, aes(color = 'lm: y ~ x'), size = 1) +
  stat_smooth(method = 'lm', formula = y ~ poly(x, 2), aes(color = 'lm: y ~ poly(x, 2)'), size = 1) +
  stat_smooth(method = 'lm', formula = y ~ poly(x, 3), aes(color = 'lm: y ~ poly(x, 3)')) +
  stat_smooth(method = 'gam', formula = y ~ s(x), aes(color = 'gam: y ~ s(x)')) +
  scale_color_manual(values=2:5) +
  theme(text = element_text(face= "bold",size = 20))

image

Observations:

Mean price by clarity

Use the function dplyr package to create a new data frame containing info on diamonds by clarity.

Name the data frame diamondsByClarity

The data frame should contain the following variables in this order.

1. mean_price
2. median_price
3. min_price
4. max_price
5. n
diamondsByClarity = group_by(diamonds, clarity)
diamondsByClarity = summarise(diamondsByClarity,
                              mean_price = mean(price),
                              median_price = median(price),
                              min_price = min(price),
                              max_price = max(price),
                              n = n())
diamondsByClarity

image

Bar Charts of Mean Price

We’ve created summary data frames with the mean price
by clarity and color. You can run the code in R to
verify what data is in the variables diamonds_mp_by_clarity
and diamonds_mp_by_color.

Your task is to write additional code to create two bar plots
on one output image using the grid.arrange() function from the package
gridExtra.

flowigdata.com article on Bar Charts vs Histograms: http://flowingdata.com/2014/02/27/how-to-read-histograms-and-use-them-in-r/

data(diamonds)
library(dplyr)
library(gridExtra)

diamonds_by_clarity <- group_by(diamonds, clarity)
diamonds_mp_by_clarity <- summarise(diamonds_by_clarity, mean_price = mean(price))

diamonds_by_color <- group_by(diamonds, color)
diamonds_mp_by_color <- summarise(diamonds_by_color, mean_price = mean(price))

#diamonds_by_cut <- group_by(diamonds, cut)
#diamonds_mp_by_cut <- summarise(diamonds_by_cut, mean_price = mean(price))

p1 <- ggplot(aes(x = clarity, y = mean_price, fill = clarity),
        data = diamonds_mp_by_clarity) + geom_bar(stat='identity') +
         theme(text = element_text(face= "bold",size = 20))
p2 <- ggplot(aes(x = color, y = mean_price, fill = color),
        data = diamonds_mp_by_color) + geom_bar(stat='identity') +
         theme(text = element_text(face= "bold",size = 20))
#p3 <- ggplot(aes(x = cut, y = mean_price), data = diamonds_mp_by_cut) + geom_bar(stat='identity')
grid.arrange(p1, p2, ncol = 1)

image

Observations:

Gapminder Revisited

library(openxlsx)
library(tidyr)
fertility <- read.xlsx('indicator undata total_fertility.xlsx', rowNames = TRUE, rows=1:259)
fertility <- mutate(fertility, country=rownames(fertility))
fertility <- gather(fertility, year, "fertility", 1:(length(fertility)-1), convert=TRUE)
fertility <- arrange(fertility, country, year)

income <- read.xlsx('indicator gapminder gdp_per_capita_ppp.xlsx', rowNames = TRUE, rows=1:260)
income <- mutate(income, country=rownames(income))
income <- gather(income, year, "fertility", 1:(length(income)-1), convert = TRUE)
income <- arrange(income, country, year)

fer_inc <- inner_join(income, fertility, by = c("country", "year"))
names(fer_inc) <- c("country", "year", "income", "fertility")
head(fer_inc)

How did fertility income change in time?

summary(fer_inc$income)
ggplot(aes(x = year, y = income), data = fer_inc) +
  geom_point(alpha = 0.2, color = 'orange') +
  coord_cartesian(ylim=c(300, 190000)) +
  scale_y_log10() +
  geom_line(stat = 'summary', fun.y = mean) +
  geom_line(stat = 'summary', fun.y = median, color = 'blue') +
  geom_line(stat = 'summary', fun.y = quantile, probs = .1, linetype = 2, color = 'blue') +
  geom_line(stat = 'summary', fun.y = quantile, probs = .9, linetype = 2, color = 'blue')

fer_inc[!is.na(fer_inc$income) & fer_inc$income < 300,]

Observations:

How did income and fertility changed in time?

fer_inc.means <- fer_inc %>%
  group_by(year) %>%
  summarise(mean_income = mean(income, na.rm = TRUE),
            mean_fertility = mean(fertility, na.rm = TRUE),
            n = n()) %>%
  arrange(year)
head(fer_inc.means)
p1 <- ggplot(aes(x = year), data = fer_inc.means) +
  geom_point(aes(y = mean_income), color = 'red')
p2 <- ggplot(aes(x = year), data = fer_inc.means) +  
  geom_point(aes(y = mean_fertility), color = 'blue')
library(gridExtra)
grid.arrange(p1, p2, ncol=1)

cor.test(fer_inc$income, fer_inc$fertility)

Observations:




Lesson 5

Multivariate Data

Notes:


Moira Perceived Audience Size Colored by Age

Notes: the idea is that perceived audience size accuracy might change with age (older people being more accurate about the size).
Scatterplot with coloured points (cololur => age) didn’t show a clear answer.


Third Qualitative Variable

Notes:

library(ggplot2)
library(ggthemes)
theme_set(theme_minimal())
pf <- read.csv('pseudo_facebook.tsv', sep = '\t')
# Female  FB users are generally older than males.


pf %>% subset(!is.na(gender)) %>%
ggplot(data = ., aes(x = gender, y = age)) +
    geom_boxplot() +
    stat_summary(fun.y = mean, geom = "point", shape = 4) +
    theme(text = element_text(face= "bold",size = 20))

image

pf %>% subset(!is.na(gender)) %>%
ggplot(data = ., aes(x = age, y = friend_count)) +
    geom_line(aes(color = gender), stat = 'summary', fun.y = median) +
    theme(text = element_text(face= "bold",size = 20))

image

Excercise

Write code to create a new data frame, called ‘pf.fc_by_age_gender’, that contains
information on each age AND gender group.

The data frame should contain the following variables:

library(dplyr)
pf.fc_by_age_gender <- 
    pf %>%
      filter(!is.na(gender)) %>%
      group_by(age, gender) %>%
      summarise(mean_friend_count = mean(friend_count),
                median_friend_count = median(friend_count),
                n = n()) %>%
      ungroup() %>%
      arrange(age)

pf.fc_by_age_gender

image


Plotting Conditional Summaries

Notes: with the conditional summaries inside dataframe, we no longer need to use ‘stat’ and ‘fun.y’ parameters to geom_line.


pf.fc_by_age_gender %>%
ggplot(data = ., aes(x = age, y = median_friend_count)) +
    geom_line(aes(color = gender)) +
    theme(text = element_text(face= "bold",size = 20))

image


Thinking in Ratios

Notes: charts shows that friend count difference is larger for younger users.
We’d like to see a ratio: how many more friends do girls have. Does the ratio hold in time?


Wide and Long Format

Notes:

Long format: (age, gender) -> (observations)

Wide format: (age) -> (observation for male, observation for female)


Reshaping Data

Notes:

library(tidyr)
library(reshape2)

# rehape with tidyr (note 'spread' and 'mutate')
pf.fc_by_age_gender.wide <-
  pf.fc_by_age_gender %>%
  select(age, gender, median_friend_count) %>%
  subset(!is.na(gender)) %>% 
    spread(gender, median_friend_count) %>% 
    mutate(ratio = male / female) 
head(pf.fc_by_age_gender.wide)

image

pf.fc_by_age_gender.wide <- 
    pf.fc_by_age_gender %>%
        dcast(age ~ gender, value.var = 'median_friend_count')
head(pf.fc_by_age_gender.wide)

image


Ratio Plot

Notes:

# Plot the ratio of the female to male median friend counts 
# using the data frame pf.fc_by_age_gender.wide.

# Think about what geom you should use.
# Add a horizontal line to the plot with a y intercept of 1, which will be the base line. 
# Look up the documentation for geom_hline to do that. Use the parameter linetype in geom_hline to make the line dashed.

# The linetype parameter can take the values 0-6:
# 0 = blank, 1 = solid, 2 = dashed, 3 = dotted, 4 = dotdash, 5 = longdash, 6 = twodash
pf.fc_by_age_gender.wide %>%
    ggplot(aes(x = age, y = female / male)) +
        geom_line(color = '#02a0e2') +
        geom_hline(yintercept = 1, color = 'red', linetype = 2) +
        theme(text = element_text(face= "bold",size = 20))

image


Third Quantitative Variable

Notes: from the graph above, female have more friends than males (> 2 times more for young users).
Hypothesis: there is lots of male who are new-joiners and haven't build their friends list yet.
We want to analyze the ratio based on account age (tenure).

pf$year_joined <- 2014 - ceiling(pf$tenure / 365)
# optionally:  pf$year_joined <- floor(2014 - pf$tenure / 365)

Cut a Variable

Notes: there are not many users for 2005 and 2006, so we want to group the early years users together.

table(pf$year_joined)
pf$year_joined.bucket <- cut(pf$year_joined, breaks = c(2004, 2009, 2011, 2012, 2014))
summary(pf$year_joined.bucket)

image

image


Plotting it All Together

Notes: plot belows confirms the thesies: user who are longer of facebook (bigger tenure) have more friends.

# Create a line graph of friend_count vs. age so that each year_joined.bucket is a line
# tracking the median user friend_count across age. This means you should have four different
# lines on your plot.

# You should subset the data to exclude the users whose year_joined.bucket is NA.
pf %>%
    subset(!is.na(year_joined.bucket)) %>%
    ggplot(data = ., aes(x = age, y = friend_count)) +
        geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = median) +
        theme(text = element_text(face= "bold",size = 20))

image


Plot the Grand Mean

Notes: grand mean is low (below the [2011-2012] cohort), which means that large number of users are in the last cohort
and bring the grand mean down.

# Write code to do the following:

# (1) Add another geom_line to code below to plot the grand mean of the friend count vs age.
# (2) Exclude any users whose year_joined.bucket is NA.
# (3) Use a different line type for the grand mean.

# As a reminder, the parameter linetype can take the values 0-6:
# 0 = blank, 1 = solid, 2 = dashed, # 3 = dotted, 4 = dotdash, 5 = longdash, # 6 = twodash
pf %>%
    subset(!is.na(year_joined.bucket)) %>%
    ggplot(data = ., aes(x = age, y = friend_count)) +
        geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = mean) +
        geom_line(stat = "summary", fun.y = mean, color = 'blue', alpha = 0.5, linetype = 'dashed')+
        theme(text = element_text(face= "bold",size = 20))

image


Friending Rate

Notes:

friend_rate <- subset(pf, tenure >= 1)
friend_rate <- friend_rate$friend_count / friend_rate$tenure
summary(friend_rate)
# alt
# with(subset(pf, tenure >= 1), summary(friend_count / tenure))

image


Friendships Initiated

Notes:

What is the median friend rate?

0.2205

What is the maximum friend rate?

417.0000

# Create a line graph of mean of friendships_initiated per day (of tenure)
# vs. tenure colored by year_joined.bucket.

# You need to make use of the variables tenure, friendships_initiated, and year_joined.bucket.

pf %>%
subset(tenure >= 1) %>%
    ggplot(data = ., aes(x = tenure, y = friendships_initiated / tenure)) +
        geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = mean) +
        theme(text = element_text(face= "bold",size = 20))

image

pf %>%
subset(tenure >= 1) %>%
    ggplot(data = ., aes(x = tenure, y = friendships_initiated / tenure)) +
        geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = mean) +
        scale_y_sqrt() +
        theme(text = element_text(face= "bold",size = 20))

image


Bias-Variance Tradeoff Revisited

Notes: understanding bias vs variance: http://scott.fortmann-roe.com/docs/BiasVariance.html

Books: The Elements of Statistical Learning: http://statweb.stanford.edu/~tibs/ElemStatLearn/

pf %>%
subset(tenure >= 1) %>%
    ggplot(data = ., aes(x = tenure, y = friendships_initiated / tenure)) +
        geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = mean) +
        theme(text = element_text(face= "bold",size = 20))

image

pf %>%
subset(tenure > 0) %>%
    ggplot(data = ., aes(x = 7 * round(tenure / 7),
                     y = friendships_initiated / tenure)) +
        geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = mean) +
        theme(text = element_text(face= "bold",size = 20))

image

pf %>%
subset(tenure > 0) %>%
    ggplot(data = ., aes(x = 30 * round(tenure / 30),
                     y = friendships_initiated / tenure)) +
        geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = mean) +
        theme(text = element_text(face= "bold",size = 20))

image

pf %>%
subset(tenure > 0) %>%
    ggplot(data = ., aes(x = 90 * round(tenure / 90),
                     y = friendships_initiated / tenure)) +
        geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = mean) +
        theme(text = element_text(face= "bold",size = 20))

image

# finally, we can use geom_smooth to add smoother to the plot
pf %>%
subset(tenure > 0) %>%
    ggplot(data = ., aes(x = 7 * round(tenure / 7),
                     y = friendships_initiated / tenure)) +
        geom_smooth(aes(color = year_joined.bucket)) +
        theme(text = element_text(face= "bold",size = 20))

image


Sean’s NFL Fan Sentiment Study

Links:

Notes:


Introducing the Yogurt Data Set

Notes: yogurt dataset contains multiple rows per household; one per each transaction.

yo <- read.csv('/Users/ndvo/Dropbox/udacity-data-science-master/p4/l5/yogurt.csv')

yo %>% str
yo %>% head

image


Histograms Revisited

Notes:

# Change the id from an int to a factor
yo$id <- factor(yo$id)
yo$id %>% str

yo %>%
    ggplot(data = ., aes(x= yo$price)) +
        geom_histogram() +
        theme(text = element_text(face= "bold",size = 20))

image

image

What do you notice?:


Number of Purchases

Notes:

image

summary(yo$price)
unique(yo$price)

# Create a new variable called all.purchases, which gives the total counts of yogurt for
# each observation or household.
yo$all.purchases <- rowSums(yo[,c('strawberry', 'blueberry', 'pina.colada', 'plain', 'mixed.berry')])

Prices over Time

Notes:

yo %>%
    ggplot(aes(x = time, y = price), data = .) +
        geom_point(alpha = 0.1, color = 'blue') +
        theme(text = element_text(face= "bold",size = 20))

image


Sampling Observations

Notes: when beginning with the dataset it’s often good to look at sample of the data.

In the yogurt dataset, it’s best to take a look at particular households to see variance within the household
and between the households.


Looking at Samples of Households

# set the seed for reproducible results
set.seed(4000)
sample.ids <- sample(levels(yo$id), 16)

yo %>%
subset(id %in% sample.ids) %>%
    ggplot(aes(x = time, y = price), data = .) +
        facet_wrap(~ id) +
        geom_line(aes(color = id)) +
                guides(color = FALSE) +
        geom_point(aes(size = all.purchases), pch = 1) +
        theme(text = element_text(face= "bold",size = 15))

image

Notes:

Seed 4000:


The Limits of Cross Sectional Data

Notes: if we have observations over time, we can facet them using primary id (household id)

This cannot be done with FB data, because it’s snapshot of friendship counts at one moment in time.


Many Variables

Notes: much of the analysis so far was focused on some pre-chosen variable, pair of variables, etc.

But we might:

We can speed up the analysis, by producing many plots & comparisons at once.


Scatterplot Matrix

library(GGally)
theme_set(theme_minimal(20))

# set the seed for reproducible results
set.seed(1836)
pf_subset <- pf[, c(2:15)]
names(pf_subset)
ggpairs(pf_subset[sample.int(nrow(pf_subset), 1000), ], axisLabels = 'internal',mapping=ggplot2::aes(colour = gender))

image

Notes:


Even More Variables

Notes:


Heat Maps

Notes:

melting data: http://www.r-bloggers.com/melt/

nci <- read.table("/Users/ndvo/Dropbox/udacity-data-science-master/p4/l5/nci.tsv")
# changing the colnames to produce a nicer plot
nci %>% dim
colnames(nci) <- c(1:64)

nci[1:200,] %>% as.matrix %>% str
nci[1:200,] %>% as.matrix %>% melt %>% str
nci[1:200,] %>% as.matrix %>% melt %>% head

nci.long.samp <- nci[1:200,] %>% as.matrix %>% melt
names(nci.long.samp) <- c("gene", "case", "value")
nci.long.samp %>% head

image

nci.long.samp %>%
    ggplot(aes(x = case, y = gene, fill = value), data = .) +
        geom_tile() +
        scale_fill_gradientn(colors = colorRampPalette(c("blue","red"))(100)) +
        theme(text = element_text(face= "bold",size = 20))

image


Analyzing Three of More Variables

Reflection:


Problems set 5

Price Histograms with Facet and Color

library(ggplot2)
library(ggthemes)
theme_set(theme_minimal())
library(dplyr)
data(diamonds)

# Create a histogram of diamond prices.
# Facet the histogram by diamond color and use cut to color the histogram bars.

# The plot should look something like this. http://i.imgur.com/b5xyrOu.jpg

# Note: In the link, a color palette of type 'qual' was used to color the histogram using
# scale_fill_brewer(type = 'qual')

diamonds %>%
    ggplot(aes(x= price), data = .) +
    geom_histogram(aes(fill = cut), position = 'stack') +
    facet_wrap(~ color) +
    scale_fill_brewer("cut", type = 'qual', palette = "Set1") +
    theme(text = element_text(face= "bold",size = 20))

image

Observations:

Price vs. Table Colored by Cut

# Create a scatterplot of diamond price vs. table and color the points by the cut of
# the diamond.

# The plot should look something like this.
# http://i.imgur.com/rQF9jQr.jpg


diamonds %>%
    ggplot(aes(x = table, y = price), data = .) +
      geom_point(aes(color = cut), size = 3) +
      # add scale and limits to see typical range easily
      scale_x_continuous(limits = c(50, 70), breaks=seq(50, 70, 1)) +
      scale_color_brewer("cut", type = 'qual', palette = 1) +
      theme(text = element_text(face= "bold",size = 20))

image

Observations:

Typical table value

What is the typical table range for the majority of diamonds of IDEAL cut? 53-57

What is the typical table range for the majority of diamonds of PREMIUM cut? 58-62

Price vs. Volume and Diamond Clarity

# Create a scatterplot of diamond price vs. volume (x * y * z) and color the points by
# the clarity of diamonds. 
# Use scale on the y-axis to take the log10 of price. You should also omit the top 1% of diamond volumes from the plot.

# Note: Volume is a very rough approximation of a diamond's actual volume.

# The plot should look something like this. http://i.imgur.com/excUpea.jpg

# Note: In the link, a color palette of type 'div' was used to color the scatterplot using
# scale_color_brewer(type = 'div')

diamonds$volume = diamonds$x * diamonds$y * diamonds$z

diamonds %>%
subset(volume < quantile(diamonds$volume, 0.99)) %>%
    ggplot(aes(x = volume, y = price), data = .) +
      geom_point(aes(color = clarity)) +
      scale_y_log10() +
      scale_color_brewer(type = 'div')  +
      theme(text = element_text(face= "bold",size = 20))

image

Observations:

Proportion of Friendships Initiated

Many interesting variables are derived from two or more others.

For example, we might wonder how much of a person’s network on a service like Facebook the user actively initiated. Two users
with the same degree (or number of friends) might be very different if one initiated most of those connections on the
service, while the other initiated very few. So it could be useful to consider this proportion of existing friendships that
the user initiated. This might be a good predictor of how active a user is compared with their peers, or other traits, such as
personality (i.e., is this person an extrovert?).

Your task is to create a new variable called ‘prop_initiated’ in the Pseudo-Facebook data set. The variable should contain
the proportion of friendships that the user initiated.

pf = read.delim('pseudo_facebook.tsv')
pf$prop_initiated = pf$friendships_initiated / pf$friend_count

prop_initiated vs. tenure

# Create a line graph of 
# the median proportion of friendships initiated ('prop_initiated') vs. tenure 
# and color the line segment by year_joined.bucket.

# Recall, we created year_joined.bucket in Lesson 5
# by first creating year_joined from the variable tenure.
# Then, we used the cut function on year_joined to create
# four bins or cohorts of users.

# (2004, 2009]
# (2009, 2011]
# (2011, 2012]
# (2012, 2014]

# The plot should look something like this.
# http://i.imgur.com/vNjPtDh.jpg
# OR this
# http://i.imgur.com/IBN1ufQ.jpg

pf$year_joined <- 2014 - ceiling(pf$tenure / 365)
pf$year_joined.bucket <- cut(pf$year_joined, breaks = c(2004, 2009, 2011, 2012, 2014))

pf %>%
    ggplot(aes(x= tenure, y = prop_initiated), data = .) +
    geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = median) +
    theme(text = element_text(face= "bold",size = 20))
    

image

Observations:

Smoothing prop_initiated vs. tenure

# Smooth the last plot you created of of prop_initiated vs tenure colored by
# year_joined.bucket. 
# You can bin together ranges of tenure or add a smoother to the plot.

pf %>%
    ggplot(aes(x = 30 * round(tenure / 30), y = prop_initiated), data = .) +
        geom_line(aes(color = year_joined.bucket), 
                    stat = 'summary', fun.y = median) +
        theme(text = element_text(face= "bold",size = 20))

image

pf %>%
    ggplot(aes(tenure, y = prop_initiated), data = .) +
        geom_smooth(aes(color = year_joined.bucket)) +
        theme(text = element_text(face= "bold",size = 20))

image

For the group with the largest propotion of the friendships initiated, what is the group’s average (mean)
proportion of friendships initiaited?

with(subset(pf, pf$year_joined.bucket == "(2012,2014]"), table(year_joined.bucket))

with(subset(pf, year_joined.bucket == "(2012,2014]"), mean(prop_initiated, na.rm=TRUE))

image

Price/Carat Binned, Faceted, & Colored

# Create a scatter plot of the price/carat ratio of diamonds. 
# The variable x should be assigned to cut. 
# The points should be colored by diamond color, 
# and the plot should be faceted by clarity.

# The plot should look something like this.
# http://i.imgur.com/YzbWkHT.jpg.

# Note: In the link, a color palette of type 'div' was used to color the histogram using
# scale_color_brewer(type = 'div')

diamonds %>%
    ggplot(aes(x = cut, y = price/carat), data = .) +
      # use geom_jitter to spread the values inside single color bucket
      geom_jitter(aes(color = color), alpha = 1, position = position_jitter(h = 0)) +
      facet_wrap(~ clarity) +
      scale_color_brewer(type = 'div') +
      theme(text = element_text(face= "bold",size = 12))

image

Observations:

Gapminder Multivariate Analysis

Gapminder data: http://www.gapminder.org/data/

In your investigation, examine 3 or more variables and create 2-5 plots that make use of the techniques from Lesson 5.

# scatter/line plot of GDP and education level of population, male and female
library(openxlsx)
library(tidyr)

literacy_total <- read.xlsx('/Users/ndvo/Dropbox/udacity-data-science-master/p4/ps5/indicator SE_ADT_LITR_ZS.xlsx', rowNames = TRUE)
literacy_total %>% head
literacy_total %>% str
literacy_total <- mutate(literacy_total, country = rownames(literacy_total))
literacy_total <- gather(literacy_total, year, "literacy.total", 1:(length(literacy_total)-1), convert=TRUE)
literacy_total <- arrange(literacy_total, country, year)
literacy_total <- subset(literacy_total, !is.na(literacy_total$literacy.total))
literacy_total %>% head
literacy_total %>% str

image

image

image

image

literacy_male = read.xlsx('/Users/ndvo/Dropbox/udacity-data-science-master/p4/ps5/indicator SE_ADT_LITR_MA_ZS.xlsx', rowNames = TRUE)
literacy_male %>% head
literacy_male %>% str
literacy_male <- mutate(literacy_male, country = rownames(literacy_male))
literacy_male <- gather(literacy_male, year, "literacy.male", 1:(length(literacy_male)-1), convert=TRUE)
literacy_male <- arrange(literacy_male, country, year)
literacy_male <- subset(literacy_male, !is.na(literacy_male$literacy.male))
literacy_male %>% head
literacy_male %>% str

image

literacy_female = read.xlsx('/Users/ndvo/Dropbox/udacity-data-science-master/p4/ps5/indicator SE_ADT_LITR_FE_ZS.xlsx', rowNames = TRUE)
literacy_female %>% head
literacy_female %>% str
literacy_female <- mutate(literacy_female, country = rownames(literacy_female))
literacy_female <- gather(literacy_female, year, "literacy.female", 1:(length(literacy_female)-1), convert=TRUE)
literacy_female <- arrange(literacy_female, country, year)
literacy_female <- subset(literacy_female, !is.na(literacy_female$literacy.female))
literacy_female %>% head
literacy_female %>% str

image

income <- read.xlsx('/Users/ndvo/Dropbox/udacity-data-science-master/p4/ps5/indicator gapminder gdp_per_capita_ppp.xlsx', rowNames = TRUE, rows=1:260)
income %>% head
income %>% str
income <- mutate(income, country=rownames(income))
income <- gather(income, year, "income", 1:(length(income)-1), convert = TRUE)
income <- arrange(income, country, year)
income %>% head
income %>% str

image

lit_inc <- inner_join(income, literacy_total, by = c("country", "year"))
lit_inc <- inner_join(lit_inc, literacy_male, by = c("country", "year"))
lit_inc <- inner_join(lit_inc, literacy_female, by = c("country", "year"))
lit_inc$country.fact = factor(lit_inc$country)
lit_inc %>% head
lit_inc %>% str

image

lit_inc2 <- gather(lit_inc, "group", "literacy", 4:6)
lit_inc2$group.fact = factor(lit_inc2$group)
lit_inc2 %>% head
lit_inc2 %>% str

image

lit_inc2 %>%
    ggplot(aes(x = round(income/2000)*2000, y = literacy), data = .) +
        geom_line(aes(color = group.fact), stat = 'summary', fun.y = mean) +
        scale_x_log10() +
        theme(text = element_text(face= "bold",size = 20))

image

Observations:

set.seed(4000)
sample.countries <- sample(levels(lit_inc$country.fact), 16)

lit_inc %>%
subset(country %in% sample.countries) %>%
    ggplot(aes(x = literacy.total, y = income, color = country.fact), data = .) +
      geom_point(size = 3) +
      scale_y_log10() +
      facet_wrap(~ country.fact)+
      theme(text = element_text(face= "bold",size = 18)) +
      guides(color = FALSE)

image

Observations: