Set working directory and read data:
setwd("/Users/ndvo/Dropbox/Udacity\ Nanodegree/P4_EDA/facebook/")
pf = read.delim('pseudo_facebook.tsv')
> head(pf)
userid age dob_day dob_year dob_month gender tenure friend_count
1 2094382 14 19 1999 11 male 266 0
2 1192601 14 2 1999 11 female 6 0
3 2083884 14 16 1999 11 male 13 0
4 1203168 14 25 1999 12 female 93 0
5 1733186 14 4 1999 12 male 82 0
6 1524765 14 1 1999 12 male 15 0
friendships_initiated likes likes_received mobile_likes mobile_likes_received
1 0 0 0 0 0
2 0 0 0 0 0
3 0 0 0 0 0
4 0 0 0 0 0
5 0 0 0 0 0
6 0 0 0 0 0
www_likes www_likes_received
1 0 0
2 0 0
3 0 0
4 0 0
5 0 0
6 0 0
> str(pf)
'data.frame': 99003 obs. of 15 variables:
$ userid : int 2094382 1192601 2083884 1203168 1733186 1524765 1136133 1680361 1365174 1712567 ...
$ age : int 14 14 14 14 14 14 13 13 13 13 ...
$ dob_day : int 19 2 16 25 4 1 14 4 1 2 ...
$ dob_year : int 1999 1999 1999 1999 1999 1999 2000 2000 2000 2000 ...
$ dob_month : int 11 11 11 12 12 12 1 1 1 2 ...
$ gender : Factor w/ 2 levels "female","male": 2 1 2 1 2 2 2 1 2 2 ...
$ tenure : int 266 6 13 93 82 15 12 0 81 171 ...
$ friend_count : int 0 0 0 0 0 0 0 0 0 0 ...
$ friendships_initiated: int 0 0 0 0 0 0 0 0 0 0 ...
$ likes : int 0 0 0 0 0 0 0 0 0 0 ...
$ likes_received : int 0 0 0 0 0 0 0 0 0 0 ...
$ mobile_likes : int 0 0 0 0 0 0 0 0 0 0 ...
$ mobile_likes_received: int 0 0 0 0 0 0 0 0 0 0 ...
$ www_likes : int 0 0 0 0 0 0 0 0 0 0 ...
$ www_likes_received : int 0 0 0 0 0 0 0 0 0 0 ...
Notes:
Notes:
ggplot(aes(x = age, y = friend_count), data = pf) +
geom_point()
Response:
Notes:
Notes:
# to find out min and max of pf$age
# summary(pf$age)
# 13.00 20.00 28.00 37.28 50.00 113.00
# also, setting alpha = 1/20 - it takes 20 points to build to 'full' colour point.
# geom_jitter (instead of geom_point) - at random noise to disperse the vertical columns
ggplot(aes(x = age, y = friend_count), data = pf) +
geom_jitter(alpha = 1/20) +
#scale_y_log10() +
xlim(13, 90)
Response:
Notes:
Look up the documentation for coord_trans() and add a layer to the plot that transforms friend_count using the square root function. Create your plot!
# Dealing with jitter values < 0
#ggplot(aes(x = age, y = friend_count), data = pf[pf$friend_count>0,]) +
# geom_jitter(alpha = 1/20) +
# coord_trans(ytrans = "sqrt") +
# xlim(13, 90)
ggplot(aes(x = age, y = friend_count), data = pf) +
geom_jitter(alpha = 1/20, position = position_jitter(h = 0)) +
coord_trans(ytrans = "sqrt") +
xlim(13, 90)
There is more younger users of facebook (density of points for age < 30 is higher than for other ages)
Examine the relationship between friendships_initiated (y) and age (x) using the ggplot syntax.
We recommend creating a basic scatter plot first to see what the distribution looks like and then adjusting it by adding one layer at a time.
Notes:
p <- ggplot(aes(x = age, y = friendships_initiated), data = pf)
p1 <- p + geom_point() + theme(text = element_text(face= "bold",size = 25))
p2 <- p + geom_jitter() + theme(text = element_text(face= "bold",size = 25))
p3 <- p + geom_jitter(alpha = 1/20, position = position_jitter(h = 0)) +
scale_y_sqrt() +
theme(text = element_text(face= "bold",size = 25)) +
xlim(13, 90)
grid.arrange(p1, p2, p3, ncol = 1)
Notes:
More dplyr resources:
dplyrtalk:
library(dplyr)
pf.fc_by_age <- pf %>%
select(age, friend_count) %>%
group_by(age) %>%
summarise(Mean_Friend_Count = mean(friend_count))
Create your plot!
p_fc_by_age <- ggplot(aes(x = age, y = Mean_Friend_Count, fill = age), data = pf.fc_by_age)
p_fc_by_age + geom_bar(stat = 'identity') +
scale_x_continuous(breaks= c(seq(15,120,5),13))
p_fc_by_age <- ggplot(aes(x = age, y = Mean_Friend_Count, fill = as.factor(age)), data = pf.fc_by_age)
p_fc_by_age + geom_bar(stat = 'identity') +
scale_x_continuous(breaks= c(seq(15,120,5),13))
p_fc_by_age + geom_line() +
scale_x_continuous(breaks= c(seq(15,120,5),13))
We take the jittered friendship graph as the base and add 4 summary overlays on top of it:
ggplot(aes(x = age, y = friend_count), data = pf) +
#xlim(13, 90) +
#coord_trans(ytrans = "sqrt") +
coord_cartesian(xlim = c(13, 90), ylim=c(0, 1000)) +
geom_point(alpha = 0.05,
position = position_jitter(h = 0),
color = 'orange') +
geom_line(stat = 'summary', fun.y = mean, aes(color = 'Mean'), size = 1) +
geom_line(stat = 'summary', fun.y = median, aes(color = 'Median'), size = 1) +
geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .1), linetype = 2, aes(color = '10th percentile'), size = 1) +
geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .9), linetype = 2, aes(color = '90th percentile'), size = 1) +
scale_color_manual(values=c("black", "blue", "red", "dark green")) +
theme(text = element_text(face= "bold",size = 25))
Response:
See the Instructor Notes of this video to download Moira’s paper on perceived audience size and to see the final plot.
Notes:
Notes:
cor.test(pf$age, pf$friend_count, method="pearson")
Look up the documentation for the cor.test function.
What’s the correlation between age and friend count? Round to three decimal places.
Response:
cor = -0.027
Results:
-0.3 < cor < 0.3 - no correlation
0.3 < cor < 0.5 - small correlation
0.5 < cor < 0.7 - moderate correlation
0.7 < cor - large correlation
Notes:
with(pf[pf$age<=70,] , cor.test(age, friend_count))
Notes: correlation numbers are useful but do not give as much insight as plotting the data.
Notes:
# There are few significant outliers that extend the axis. Let's remove them as 1-2% top outliers. Where to cut?
# quantile(pf$likes_received, probs = c(.90, .95, .98))
ggplot(aes(x = www_likes_received, y = likes_received), data=pf) +
geom_point(alpha = 0.05) +
coord_cartesian(xlim = c(1, 600), ylim = c(1, 600)) +
scale_x_sqrt() +
scale_y_sqrt() +
# add correlation line
geom_smooth(method = 'lm', color = 'red')
What’s the correlation betwen the two variables? Include the top 5% of values for the variable in the calculation and round to 3 decimal places.
cor.test(pf$www_likes_received, pf$likes_received, method="pearson")
Response: 0.948
Notes:
Notes:
#install.packages('alr3')
library(alr3)
data(Mitchell)
ggplot(aes(x = Month, y = Temp), data = Mitchell) +
geom_point()
a. Take a guess for the correlation coefficient for the scatterplot.
0.05
b. What is the actual correlation of the two variables?
(Round to the thousandths place)
cor.test(Mitchell$Month, Mitchell$Temp, method="pearson")
0.057
Break up the x-axis in 12 months increments
ggplot(aes(x = Month, y = Temp), data = Mitchell) +
geom_point() +
scale_x_continuous(breaks = seq(0, 203, 12))
Zoom in into the plot, stretch y-axis and extend the x-axis.
What do you notice?
Response:
Every year, temperature follows the same pattern: starts with low values on January, goes up
and peaks around June, July. Then, the temperature symmetrically goes down to lowest values in December.
Function looks like first half of sinusoidal function.
Watch the solution video and check out the Instructor Notes!
Notes:
ggplot(aes(x=(Month%%12),y=Temp),data=Mitchell)+
geom_point()
Notes: concrete values of means are just sample of the data and still contains some noise inside them.
The effect is event more visible if we count meanfriendcount / agebymonth.
ggplot(aes(x=age, y=friend_count_mean), data=pf.fc_by_age) +
geom_line()
head(pf.fc_by_age, 10)
pf.fc_by_age[17:19,]
# Use December 31 as the reference date.
pf$age_with_months <- pf$age + (1 - pf$dob_month / 12)
Programming Assignment
pf.fc_by_age_months <- pf %>%
group_by(age_with_months) %>%
summarise(friend_count_mean = mean(friend_count),
friend_count_median = median(friend_count),
n = n()) %>%
arrange(age_with_months)
head(pf.fc_by_age_months)
pf.fc_by_age_months[pf.fc_by_age_months$age_with_months<71,] %>%
ggplot(data = ., aes(x = age_with_months, y = friend_count_mean)) +
geom_line() +
theme(text = element_text(face= "bold",size = 25))
p1 <- subset(pf.fc_by_age, age < 71) %>%
ggplot(aes(x = age, y = friend_count_mean), data = .) +
geom_line() +
theme(text = element_text(face= "bold",size = 25))
p2 <- pf.fc_by_age_months[pf.fc_by_age_months$age_with_months<71,] %>%
ggplot(data = ., aes(x = age_with_months, y = friend_count_mean)) +
geom_line() +
theme(text = element_text(face= "bold",size = 25))
library(gridExtra)
grid.arrange(p2, p1, ncol = 1)
Notes:
Local Regression:
p1 <- subset(pf.fc_by_age, age < 71) %>%
ggplot(aes(x = age, y = friend_count_mean), data = .) +
geom_line() +
geom_smooth() +
theme(text = element_text(face= "bold",size = 20))
p2 <- pf.fc_by_age_months[pf.fc_by_age_months$age_with_months<71,] %>%
ggplot(data = ., aes(x = age_with_months, y = friend_count_mean)) +
geom_line() +
geom_smooth() +
theme(text = element_text(face= "bold",size = 20))
p3 <- subset(pf, age<71) %>%
ggplot(aes(x = round(age / 5) * 5, y = friend_count), data = .) +
geom_line(stat = 'summary', fun.y = mean) +
theme(text = element_text(face= "bold",size = 20))
library(gridExtra)
grid.arrange(p2, p1, p3, ncol = 1)
Notes:
Reflection:
Extra notes:
# In this problem set, you'll continue
# to explore the diamonds data set.
# Your first task is to create a
# scatterplot of price vs x.
# using the ggplot syntax.
library(ggplot2)
library(ggthemes)
theme_set(theme_minimal())
library(dplyr)
data(diamonds)
summary(diamonds$x)
diamonds %>%
ggplot(aes(x = x, y = price), data = .) +
geom_point(alpha = 0.05) +
scale_x_continuous(limits = c(3.5,10),breaks = seq(3,10,0.5)) +
theme(text = element_text(face= "bold",size = 20))
Observations:
# correlation between price and x (0.88)
cor.test(diamonds$price, diamonds$x)
# correlation between price and y (0.87)
cor.test(diamonds$price, diamonds$y)
# correlation between price and z (0.86)
cor.test(diamonds$price, diamonds$z)
diamonds %>%
ggplot(aes(x = depth, y = price), data = .) +
geom_point(alpha = 0.1) +
scale_x_continuous(limits = c(50,70),breaks = seq(50,70,1)) +
theme(text = element_text(face= "bold",size = 20))
# Change the code to make the transparency of the
# points to be 1/100 of what they are now and mark
# the x-axis every 2 units.
diamonds %>%
ggplot(aes(x = depth, y = price), data = .) +
geom_point(alpha = 0.01) +
scale_x_continuous(limits = c(50,70),breaks = seq(50,70,2)) +
theme(text = element_text(face= "bold",size = 20))
Typical depth range: 58 to 64.
# What's the correlation of depth vs price? (-0.01)
cor.test(diamonds$depth, diamonds$price)
Would you use depth to predict price of a diamond: No
Why: low absolute value of correlation coefficient (next to 0) means there is no correlation between depth and price.
Diamonds of any depth can achieve both very low and very high price.
# Create a scatterplot of price vs carat
# and omit the top 1% of price and carat
# values.
diamonds.top1p_excluded <- diamonds %>%
subset(diamonds$carat < quantile(diamonds$carat, .99) &
diamonds$price < quantile(diamonds$price, .99))
diamonds.top1p_excluded %>%
ggplot(aes(x = carat, y = price), data = .) +
geom_point() +
theme(text = element_text(face= "bold",size = 20))
# Create a scatterplot of price vs. volume (x * y * z).
# This is a very rough approximation for a diamond's volume.
diamonds %>%
ggplot(aes(x = volume, y = price), data = .) +
geom_point(alpha = 0.05) +
scale_x_log10(limits=c(50, 1000), breaks=c(50, 100, 200, 300, 400, 500, 700, 1000)) +
theme(text = element_text(face= "bold",size = 20))
Observations:
Did not notice:
# What's the correlation of price and volume? (0.92)
# NOTE: instructor notes suggest removing outliers (vol > 800 && vol = 0)
cor.test(diamonds$price, diamonds$volume)
# Subset the data to exclude diamonds with a volume
# greater than or equal to 800. Also, exclude diamonds
# with a volume of 0. Adjust the transparency of the
# points and add a linear model to the plot. (See the
# Instructor Notes or look up the documentation of
# geom_smooth() for more details about smoothers.)
require(mgcv)
diamonds.vol_nooutliers = diamonds[diamonds$volume < 800 & diamonds$volume != 0, ]
ggplot(aes(x = volume, y = price), data = diamonds.vol_nooutliers) +
geom_point(alpha = 0.05) +
stat_smooth(method = 'lm', formula = y ~ x, aes(color = 'lm: y ~ x'), size = 1) +
stat_smooth(method = 'lm', formula = y ~ poly(x, 2), aes(color = 'lm: y ~ poly(x, 2)'), size = 1) +
stat_smooth(method = 'lm', formula = y ~ poly(x, 3), aes(color = 'lm: y ~ poly(x, 3)')) +
stat_smooth(method = 'gam', formula = y ~ s(x), aes(color = 'gam: y ~ s(x)')) +
scale_color_manual(values=2:5) +
theme(text = element_text(face= "bold",size = 20))
Observations:
Use the function dplyr package to create a new data frame containing info on diamonds by clarity.
Name the data frame diamondsByClarity
The data frame should contain the following variables in this order.
1. mean_price
2. median_price
3. min_price
4. max_price
5. n
diamondsByClarity = group_by(diamonds, clarity)
diamondsByClarity = summarise(diamondsByClarity,
mean_price = mean(price),
median_price = median(price),
min_price = min(price),
max_price = max(price),
n = n())
diamondsByClarity
We’ve created summary data frames with the mean price
by clarity and color. You can run the code in R to
verify what data is in the variables diamonds_mp_by_clarity
and diamonds_mp_by_color.
Your task is to write additional code to create two bar plots
on one output image using the grid.arrange() function from the package
gridExtra.
flowigdata.com article on Bar Charts vs Histograms: http://flowingdata.com/2014/02/27/how-to-read-histograms-and-use-them-in-r/
data(diamonds)
library(dplyr)
library(gridExtra)
diamonds_by_clarity <- group_by(diamonds, clarity)
diamonds_mp_by_clarity <- summarise(diamonds_by_clarity, mean_price = mean(price))
diamonds_by_color <- group_by(diamonds, color)
diamonds_mp_by_color <- summarise(diamonds_by_color, mean_price = mean(price))
#diamonds_by_cut <- group_by(diamonds, cut)
#diamonds_mp_by_cut <- summarise(diamonds_by_cut, mean_price = mean(price))
p1 <- ggplot(aes(x = clarity, y = mean_price, fill = clarity),
data = diamonds_mp_by_clarity) + geom_bar(stat='identity') +
theme(text = element_text(face= "bold",size = 20))
p2 <- ggplot(aes(x = color, y = mean_price, fill = color),
data = diamonds_mp_by_color) + geom_bar(stat='identity') +
theme(text = element_text(face= "bold",size = 20))
#p3 <- ggplot(aes(x = cut, y = mean_price), data = diamonds_mp_by_cut) + geom_bar(stat='identity')
grid.arrange(p1, p2, ncol = 1)
Observations:
library(openxlsx)
library(tidyr)
fertility <- read.xlsx('indicator undata total_fertility.xlsx', rowNames = TRUE, rows=1:259)
fertility <- mutate(fertility, country=rownames(fertility))
fertility <- gather(fertility, year, "fertility", 1:(length(fertility)-1), convert=TRUE)
fertility <- arrange(fertility, country, year)
income <- read.xlsx('indicator gapminder gdp_per_capita_ppp.xlsx', rowNames = TRUE, rows=1:260)
income <- mutate(income, country=rownames(income))
income <- gather(income, year, "fertility", 1:(length(income)-1), convert = TRUE)
income <- arrange(income, country, year)
fer_inc <- inner_join(income, fertility, by = c("country", "year"))
names(fer_inc) <- c("country", "year", "income", "fertility")
head(fer_inc)
summary(fer_inc$income)
ggplot(aes(x = year, y = income), data = fer_inc) +
geom_point(alpha = 0.2, color = 'orange') +
coord_cartesian(ylim=c(300, 190000)) +
scale_y_log10() +
geom_line(stat = 'summary', fun.y = mean) +
geom_line(stat = 'summary', fun.y = median, color = 'blue') +
geom_line(stat = 'summary', fun.y = quantile, probs = .1, linetype = 2, color = 'blue') +
geom_line(stat = 'summary', fun.y = quantile, probs = .9, linetype = 2, color = 'blue')
fer_inc[!is.na(fer_inc$income) & fer_inc$income < 300,]
Observations:
fer_inc.means <- fer_inc %>%
group_by(year) %>%
summarise(mean_income = mean(income, na.rm = TRUE),
mean_fertility = mean(fertility, na.rm = TRUE),
n = n()) %>%
arrange(year)
head(fer_inc.means)
p1 <- ggplot(aes(x = year), data = fer_inc.means) +
geom_point(aes(y = mean_income), color = 'red')
p2 <- ggplot(aes(x = year), data = fer_inc.means) +
geom_point(aes(y = mean_fertility), color = 'blue')
library(gridExtra)
grid.arrange(p1, p2, ncol=1)
cor.test(fer_inc$income, fer_inc$fertility)
Observations:
Notes:
Notes: the idea is that perceived audience size accuracy might change with age (older people being more accurate about the size).
Scatterplot with coloured points (cololur => age) didn’t show a clear answer.
Notes:
library(ggplot2)
library(ggthemes)
theme_set(theme_minimal())
pf <- read.csv('pseudo_facebook.tsv', sep = '\t')
# Female FB users are generally older than males.
pf %>% subset(!is.na(gender)) %>%
ggplot(data = ., aes(x = gender, y = age)) +
geom_boxplot() +
stat_summary(fun.y = mean, geom = "point", shape = 4) +
theme(text = element_text(face= "bold",size = 20))
pf %>% subset(!is.na(gender)) %>%
ggplot(data = ., aes(x = age, y = friend_count)) +
geom_line(aes(color = gender), stat = 'summary', fun.y = median) +
theme(text = element_text(face= "bold",size = 20))
Write code to create a new data frame, called ‘pf.fc_by_age_gender’, that contains
information on each age AND gender group.
The data frame should contain the following variables:
library(dplyr)
pf.fc_by_age_gender <-
pf %>%
filter(!is.na(gender)) %>%
group_by(age, gender) %>%
summarise(mean_friend_count = mean(friend_count),
median_friend_count = median(friend_count),
n = n()) %>%
ungroup() %>%
arrange(age)
pf.fc_by_age_gender
Notes: with the conditional summaries inside dataframe, we no longer need to use ‘stat’ and ‘fun.y’ parameters to geom_line.
pf.fc_by_age_gender %>%
ggplot(data = ., aes(x = age, y = median_friend_count)) +
geom_line(aes(color = gender)) +
theme(text = element_text(face= "bold",size = 20))
Notes: charts shows that friend count difference is larger for younger users.
We’d like to see a ratio: how many more friends do girls have. Does the ratio hold in time?
Notes:
Long format: (age, gender) -> (observations)
Wide format: (age) -> (observation for male, observation for female)
Notes:
library(tidyr)
library(reshape2)
# rehape with tidyr (note 'spread' and 'mutate')
pf.fc_by_age_gender.wide <-
pf.fc_by_age_gender %>%
select(age, gender, median_friend_count) %>%
subset(!is.na(gender)) %>%
spread(gender, median_friend_count) %>%
mutate(ratio = male / female)
head(pf.fc_by_age_gender.wide)
pf.fc_by_age_gender.wide <-
pf.fc_by_age_gender %>%
dcast(age ~ gender, value.var = 'median_friend_count')
head(pf.fc_by_age_gender.wide)
Notes:
# Plot the ratio of the female to male median friend counts
# using the data frame pf.fc_by_age_gender.wide.
# Think about what geom you should use.
# Add a horizontal line to the plot with a y intercept of 1, which will be the base line.
# Look up the documentation for geom_hline to do that. Use the parameter linetype in geom_hline to make the line dashed.
# The linetype parameter can take the values 0-6:
# 0 = blank, 1 = solid, 2 = dashed, 3 = dotted, 4 = dotdash, 5 = longdash, 6 = twodash
pf.fc_by_age_gender.wide %>%
ggplot(aes(x = age, y = female / male)) +
geom_line(color = '#02a0e2') +
geom_hline(yintercept = 1, color = 'red', linetype = 2) +
theme(text = element_text(face= "bold",size = 20))
Notes: from the graph above, female have more friends than males (> 2 times more for young users).
Hypothesis: there is lots of male who are new-joiners and haven't build their friends list yet.
We want to analyze the ratio based on account age (tenure).
pf$year_joined <- 2014 - ceiling(pf$tenure / 365)
# optionally: pf$year_joined <- floor(2014 - pf$tenure / 365)
Notes: there are not many users for 2005 and 2006, so we want to group the early years users together.
table(pf$year_joined)
pf$year_joined.bucket <- cut(pf$year_joined, breaks = c(2004, 2009, 2011, 2012, 2014))
summary(pf$year_joined.bucket)
Notes: plot belows confirms the thesies: user who are longer of facebook (bigger tenure) have more friends.
# Create a line graph of friend_count vs. age so that each year_joined.bucket is a line
# tracking the median user friend_count across age. This means you should have four different
# lines on your plot.
# You should subset the data to exclude the users whose year_joined.bucket is NA.
pf %>%
subset(!is.na(year_joined.bucket)) %>%
ggplot(data = ., aes(x = age, y = friend_count)) +
geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = median) +
theme(text = element_text(face= "bold",size = 20))
Notes: grand mean is low (below the [2011-2012] cohort), which means that large number of users are in the last cohort
and bring the grand mean down.
# Write code to do the following:
# (1) Add another geom_line to code below to plot the grand mean of the friend count vs age.
# (2) Exclude any users whose year_joined.bucket is NA.
# (3) Use a different line type for the grand mean.
# As a reminder, the parameter linetype can take the values 0-6:
# 0 = blank, 1 = solid, 2 = dashed, # 3 = dotted, 4 = dotdash, 5 = longdash, # 6 = twodash
pf %>%
subset(!is.na(year_joined.bucket)) %>%
ggplot(data = ., aes(x = age, y = friend_count)) +
geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = mean) +
geom_line(stat = "summary", fun.y = mean, color = 'blue', alpha = 0.5, linetype = 'dashed')+
theme(text = element_text(face= "bold",size = 20))
Notes:
friend_rate <- subset(pf, tenure >= 1)
friend_rate <- friend_rate$friend_count / friend_rate$tenure
summary(friend_rate)
# alt
# with(subset(pf, tenure >= 1), summary(friend_count / tenure))
Notes:
What is the median friend rate?
0.2205
What is the maximum friend rate?
417.0000
# Create a line graph of mean of friendships_initiated per day (of tenure)
# vs. tenure colored by year_joined.bucket.
# You need to make use of the variables tenure, friendships_initiated, and year_joined.bucket.
pf %>%
subset(tenure >= 1) %>%
ggplot(data = ., aes(x = tenure, y = friendships_initiated / tenure)) +
geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = mean) +
theme(text = element_text(face= "bold",size = 20))
pf %>%
subset(tenure >= 1) %>%
ggplot(data = ., aes(x = tenure, y = friendships_initiated / tenure)) +
geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = mean) +
scale_y_sqrt() +
theme(text = element_text(face= "bold",size = 20))
Notes: understanding bias vs variance: http://scott.fortmann-roe.com/docs/BiasVariance.html
Books: The Elements of Statistical Learning: http://statweb.stanford.edu/~tibs/ElemStatLearn/
pf %>%
subset(tenure >= 1) %>%
ggplot(data = ., aes(x = tenure, y = friendships_initiated / tenure)) +
geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = mean) +
theme(text = element_text(face= "bold",size = 20))
pf %>%
subset(tenure > 0) %>%
ggplot(data = ., aes(x = 7 * round(tenure / 7),
y = friendships_initiated / tenure)) +
geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = mean) +
theme(text = element_text(face= "bold",size = 20))
pf %>%
subset(tenure > 0) %>%
ggplot(data = ., aes(x = 30 * round(tenure / 30),
y = friendships_initiated / tenure)) +
geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = mean) +
theme(text = element_text(face= "bold",size = 20))
pf %>%
subset(tenure > 0) %>%
ggplot(data = ., aes(x = 90 * round(tenure / 90),
y = friendships_initiated / tenure)) +
geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = mean) +
theme(text = element_text(face= "bold",size = 20))
# finally, we can use geom_smooth to add smoother to the plot
pf %>%
subset(tenure > 0) %>%
ggplot(data = ., aes(x = 7 * round(tenure / 7),
y = friendships_initiated / tenure)) +
geom_smooth(aes(color = year_joined.bucket)) +
theme(text = element_text(face= "bold",size = 20))
Links:
Notes:
Notes: yogurt dataset contains multiple rows per household; one per each transaction.
yo <- read.csv('/Users/ndvo/Dropbox/udacity-data-science-master/p4/l5/yogurt.csv')
yo %>% str
yo %>% head
Notes:
# Change the id from an int to a factor
yo$id <- factor(yo$id)
yo$id %>% str
yo %>%
ggplot(data = ., aes(x= yo$price)) +
geom_histogram() +
theme(text = element_text(face= "bold",size = 20))
What do you notice?:
Notes:
summary(yo$price)
unique(yo$price)
# Create a new variable called all.purchases, which gives the total counts of yogurt for
# each observation or household.
yo$all.purchases <- rowSums(yo[,c('strawberry', 'blueberry', 'pina.colada', 'plain', 'mixed.berry')])
Notes:
yo %>%
ggplot(aes(x = time, y = price), data = .) +
geom_point(alpha = 0.1, color = 'blue') +
theme(text = element_text(face= "bold",size = 20))
Notes: when beginning with the dataset it’s often good to look at sample of the data.
In the yogurt dataset, it’s best to take a look at particular households to see variance within the household
and between the households.
# set the seed for reproducible results
set.seed(4000)
sample.ids <- sample(levels(yo$id), 16)
yo %>%
subset(id %in% sample.ids) %>%
ggplot(aes(x = time, y = price), data = .) +
facet_wrap(~ id) +
geom_line(aes(color = id)) +
guides(color = FALSE) +
geom_point(aes(size = all.purchases), pch = 1) +
theme(text = element_text(face= "bold",size = 15))
Notes:
Seed 4000:
Notes: if we have observations over time, we can facet them using primary id (household id)
This cannot be done with FB data, because it’s snapshot of friendship counts at one moment in time.
Notes: much of the analysis so far was focused on some pre-chosen variable, pair of variables, etc.
But we might:
We can speed up the analysis, by producing many plots & comparisons at once.
library(GGally)
theme_set(theme_minimal(20))
# set the seed for reproducible results
set.seed(1836)
pf_subset <- pf[, c(2:15)]
names(pf_subset)
ggpairs(pf_subset[sample.int(nrow(pf_subset), 1000), ], axisLabels = 'internal',mapping=ggplot2::aes(colour = gender))
Notes:
Notes:
Notes:
melting data: http://www.r-bloggers.com/melt/
nci <- read.table("/Users/ndvo/Dropbox/udacity-data-science-master/p4/l5/nci.tsv")
# changing the colnames to produce a nicer plot
nci %>% dim
colnames(nci) <- c(1:64)
nci[1:200,] %>% as.matrix %>% str
nci[1:200,] %>% as.matrix %>% melt %>% str
nci[1:200,] %>% as.matrix %>% melt %>% head
nci.long.samp <- nci[1:200,] %>% as.matrix %>% melt
names(nci.long.samp) <- c("gene", "case", "value")
nci.long.samp %>% head
nci.long.samp %>%
ggplot(aes(x = case, y = gene, fill = value), data = .) +
geom_tile() +
scale_fill_gradientn(colors = colorRampPalette(c("blue","red"))(100)) +
theme(text = element_text(face= "bold",size = 20))
Reflection:
library(ggplot2)
library(ggthemes)
theme_set(theme_minimal())
library(dplyr)
data(diamonds)
# Create a histogram of diamond prices.
# Facet the histogram by diamond color and use cut to color the histogram bars.
# The plot should look something like this. http://i.imgur.com/b5xyrOu.jpg
# Note: In the link, a color palette of type 'qual' was used to color the histogram using
# scale_fill_brewer(type = 'qual')
diamonds %>%
ggplot(aes(x= price), data = .) +
geom_histogram(aes(fill = cut), position = 'stack') +
facet_wrap(~ color) +
scale_fill_brewer("cut", type = 'qual', palette = "Set1") +
theme(text = element_text(face= "bold",size = 20))
Observations:
# Create a scatterplot of diamond price vs. table and color the points by the cut of
# the diamond.
# The plot should look something like this.
# http://i.imgur.com/rQF9jQr.jpg
diamonds %>%
ggplot(aes(x = table, y = price), data = .) +
geom_point(aes(color = cut), size = 3) +
# add scale and limits to see typical range easily
scale_x_continuous(limits = c(50, 70), breaks=seq(50, 70, 1)) +
scale_color_brewer("cut", type = 'qual', palette = 1) +
theme(text = element_text(face= "bold",size = 20))
Observations:
What is the typical table range for the majority of diamonds of IDEAL cut? 53-57
What is the typical table range for the majority of diamonds of PREMIUM cut? 58-62
# Create a scatterplot of diamond price vs. volume (x * y * z) and color the points by
# the clarity of diamonds.
# Use scale on the y-axis to take the log10 of price. You should also omit the top 1% of diamond volumes from the plot.
# Note: Volume is a very rough approximation of a diamond's actual volume.
# The plot should look something like this. http://i.imgur.com/excUpea.jpg
# Note: In the link, a color palette of type 'div' was used to color the scatterplot using
# scale_color_brewer(type = 'div')
diamonds$volume = diamonds$x * diamonds$y * diamonds$z
diamonds %>%
subset(volume < quantile(diamonds$volume, 0.99)) %>%
ggplot(aes(x = volume, y = price), data = .) +
geom_point(aes(color = clarity)) +
scale_y_log10() +
scale_color_brewer(type = 'div') +
theme(text = element_text(face= "bold",size = 20))
Observations:
Many interesting variables are derived from two or more others.
For example, we might wonder how much of a person’s network on a service like Facebook the user actively initiated. Two users
with the same degree (or number of friends) might be very different if one initiated most of those connections on the
service, while the other initiated very few. So it could be useful to consider this proportion of existing friendships that
the user initiated. This might be a good predictor of how active a user is compared with their peers, or other traits, such as
personality (i.e., is this person an extrovert?).
Your task is to create a new variable called ‘prop_initiated’ in the Pseudo-Facebook data set. The variable should contain
the proportion of friendships that the user initiated.
pf = read.delim('pseudo_facebook.tsv')
pf$prop_initiated = pf$friendships_initiated / pf$friend_count
# Create a line graph of
# the median proportion of friendships initiated ('prop_initiated') vs. tenure
# and color the line segment by year_joined.bucket.
# Recall, we created year_joined.bucket in Lesson 5
# by first creating year_joined from the variable tenure.
# Then, we used the cut function on year_joined to create
# four bins or cohorts of users.
# (2004, 2009]
# (2009, 2011]
# (2011, 2012]
# (2012, 2014]
# The plot should look something like this.
# http://i.imgur.com/vNjPtDh.jpg
# OR this
# http://i.imgur.com/IBN1ufQ.jpg
pf$year_joined <- 2014 - ceiling(pf$tenure / 365)
pf$year_joined.bucket <- cut(pf$year_joined, breaks = c(2004, 2009, 2011, 2012, 2014))
pf %>%
ggplot(aes(x= tenure, y = prop_initiated), data = .) +
geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = median) +
theme(text = element_text(face= "bold",size = 20))
Observations:
# Smooth the last plot you created of of prop_initiated vs tenure colored by
# year_joined.bucket.
# You can bin together ranges of tenure or add a smoother to the plot.
pf %>%
ggplot(aes(x = 30 * round(tenure / 30), y = prop_initiated), data = .) +
geom_line(aes(color = year_joined.bucket),
stat = 'summary', fun.y = median) +
theme(text = element_text(face= "bold",size = 20))
pf %>%
ggplot(aes(tenure, y = prop_initiated), data = .) +
geom_smooth(aes(color = year_joined.bucket)) +
theme(text = element_text(face= "bold",size = 20))
For the group with the largest propotion of the friendships initiated, what is the group’s average (mean)
proportion of friendships initiaited?
with(subset(pf, pf$year_joined.bucket == "(2012,2014]"), table(year_joined.bucket))
with(subset(pf, year_joined.bucket == "(2012,2014]"), mean(prop_initiated, na.rm=TRUE))
# Create a scatter plot of the price/carat ratio of diamonds.
# The variable x should be assigned to cut.
# The points should be colored by diamond color,
# and the plot should be faceted by clarity.
# The plot should look something like this.
# http://i.imgur.com/YzbWkHT.jpg.
# Note: In the link, a color palette of type 'div' was used to color the histogram using
# scale_color_brewer(type = 'div')
diamonds %>%
ggplot(aes(x = cut, y = price/carat), data = .) +
# use geom_jitter to spread the values inside single color bucket
geom_jitter(aes(color = color), alpha = 1, position = position_jitter(h = 0)) +
facet_wrap(~ clarity) +
scale_color_brewer(type = 'div') +
theme(text = element_text(face= "bold",size = 12))
Observations:
Gapminder data: http://www.gapminder.org/data/
In your investigation, examine 3 or more variables and create 2-5 plots that make use of the techniques from Lesson 5.
# scatter/line plot of GDP and education level of population, male and female
library(openxlsx)
library(tidyr)
literacy_total <- read.xlsx('/Users/ndvo/Dropbox/udacity-data-science-master/p4/ps5/indicator SE_ADT_LITR_ZS.xlsx', rowNames = TRUE)
literacy_total %>% head
literacy_total %>% str
literacy_total <- mutate(literacy_total, country = rownames(literacy_total))
literacy_total <- gather(literacy_total, year, "literacy.total", 1:(length(literacy_total)-1), convert=TRUE)
literacy_total <- arrange(literacy_total, country, year)
literacy_total <- subset(literacy_total, !is.na(literacy_total$literacy.total))
literacy_total %>% head
literacy_total %>% str
literacy_male = read.xlsx('/Users/ndvo/Dropbox/udacity-data-science-master/p4/ps5/indicator SE_ADT_LITR_MA_ZS.xlsx', rowNames = TRUE)
literacy_male %>% head
literacy_male %>% str
literacy_male <- mutate(literacy_male, country = rownames(literacy_male))
literacy_male <- gather(literacy_male, year, "literacy.male", 1:(length(literacy_male)-1), convert=TRUE)
literacy_male <- arrange(literacy_male, country, year)
literacy_male <- subset(literacy_male, !is.na(literacy_male$literacy.male))
literacy_male %>% head
literacy_male %>% str
literacy_female = read.xlsx('/Users/ndvo/Dropbox/udacity-data-science-master/p4/ps5/indicator SE_ADT_LITR_FE_ZS.xlsx', rowNames = TRUE)
literacy_female %>% head
literacy_female %>% str
literacy_female <- mutate(literacy_female, country = rownames(literacy_female))
literacy_female <- gather(literacy_female, year, "literacy.female", 1:(length(literacy_female)-1), convert=TRUE)
literacy_female <- arrange(literacy_female, country, year)
literacy_female <- subset(literacy_female, !is.na(literacy_female$literacy.female))
literacy_female %>% head
literacy_female %>% str
income <- read.xlsx('/Users/ndvo/Dropbox/udacity-data-science-master/p4/ps5/indicator gapminder gdp_per_capita_ppp.xlsx', rowNames = TRUE, rows=1:260)
income %>% head
income %>% str
income <- mutate(income, country=rownames(income))
income <- gather(income, year, "income", 1:(length(income)-1), convert = TRUE)
income <- arrange(income, country, year)
income %>% head
income %>% str
lit_inc <- inner_join(income, literacy_total, by = c("country", "year"))
lit_inc <- inner_join(lit_inc, literacy_male, by = c("country", "year"))
lit_inc <- inner_join(lit_inc, literacy_female, by = c("country", "year"))
lit_inc$country.fact = factor(lit_inc$country)
lit_inc %>% head
lit_inc %>% str
lit_inc2 <- gather(lit_inc, "group", "literacy", 4:6)
lit_inc2$group.fact = factor(lit_inc2$group)
lit_inc2 %>% head
lit_inc2 %>% str
lit_inc2 %>%
ggplot(aes(x = round(income/2000)*2000, y = literacy), data = .) +
geom_line(aes(color = group.fact), stat = 'summary', fun.y = mean) +
scale_x_log10() +
theme(text = element_text(face= "bold",size = 20))
Observations:
set.seed(4000)
sample.countries <- sample(levels(lit_inc$country.fact), 16)
lit_inc %>%
subset(country %in% sample.countries) %>%
ggplot(aes(x = literacy.total, y = income, color = country.fact), data = .) +
geom_point(size = 3) +
scale_y_log10() +
facet_wrap(~ country.fact)+
theme(text = element_text(face= "bold",size = 18)) +
guides(color = FALSE)
Observations: