setwd("E:/Dropbox/FSEGA/cursuri/2016-2017/semestrul 2/R/date")
pf <- read.csv('pseudo_facebook.tsv', sep='\t')
setwd("E:/Dropbox/FSEGA/cursuri/2016-2017/semestrul 2/R/curs5")
str(pf)
## 'data.frame': 99003 obs. of 15 variables:
## $ userid : int 2094382 1192601 2083884 1203168 1733186 1524765 1136133 1680361 1365174 1712567 ...
## $ age : int 14 14 14 14 14 14 13 13 13 13 ...
## $ dob_day : int 19 2 16 25 4 1 14 4 1 2 ...
## $ dob_year : int 1999 1999 1999 1999 1999 1999 2000 2000 2000 2000 ...
## $ dob_month : int 11 11 11 12 12 12 1 1 1 2 ...
## $ gender : Factor w/ 2 levels "female","male": 2 1 2 1 2 2 2 1 2 2 ...
## $ tenure : int 266 6 13 93 82 15 12 0 81 171 ...
## $ friend_count : int 0 0 0 0 0 0 0 0 0 0 ...
## $ friendships_initiated: int 0 0 0 0 0 0 0 0 0 0 ...
## $ likes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ likes_received : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mobile_likes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mobile_likes_received: int 0 0 0 0 0 0 0 0 0 0 ...
## $ www_likes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ www_likes_received : int 0 0 0 0 0 0 0 0 0 0 ...
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.3
r <- cor.test(pf$age, pf$friend_count)
r
##
## Pearson's product-moment correlation
##
## data: pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
r$estimate
## cor
## -0.02740737
round(r$estimate,3)
## cor
## -0.027
library(ggplot2)
qplot(pf$age, pf$friend_count, color=I('blue'))
with(pf, cor.test(age, friend_count))
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
with(subset(pf, pf$age<60), cor.test(age,friend_count))
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -56.465, df = 83426, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1983880 -0.1853162
## sample estimates:
## cor
## -0.1918606
with(pf, cor.test(likes_received, www_likes_received))
##
## Pearson's product-moment correlation
##
## data: likes_received and www_likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9473553 0.9486176
## sample estimates:
## cor
## 0.9479902
ggplot(aes(y=likes_received, x=www_likes_received), data=pf)+
geom_point()
ggplot(aes(y=likes_received, x=www_likes_received), data=pf)+
geom_point()+
coord_cartesian(xlim=c(0,10000), ylim=c(0,10000))
sau putem sa setam automat axele sa cuprinda quantile:
ggplot(aes(y=likes_received, x=www_likes_received), data=pf)+
geom_point()+
xlim(0,quantile(pf$www_likes_received, 0.95))+
ylim(0,quantile(pf$likes_received,0.95))
## Warning: Removed 6075 rows containing missing values (geom_point).
Observatie: putem folosi corelatia pentru a aprecia legatura dintre variabile, dar e bine sa ne uitam si la reprezentarea grafica…
Incarcam un tabel nou care contine temperatura solului la 20 de cm adancime in localitatea Mitchell, Nebraska, pe o perioada de 17 ani.
install.packages("alr3", repos = "http://cran.us.r-project.org", dependencies = TRUE)
## Installing package into 'C:/Users/ro/Documents/R/win-library/3.3'
## (as 'lib' is unspecified)
## package 'alr3' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\ro\AppData\Local\Temp\RtmpUPfAlV\downloaded_packages
install.packages('car', repos = "http://cran.us.r-project.org" )
## Installing package into 'C:/Users/ro/Documents/R/win-library/3.3'
## (as 'lib' is unspecified)
## package 'car' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\ro\AppData\Local\Temp\RtmpUPfAlV\downloaded_packages
install.packages('lme4', repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/ro/Documents/R/win-library/3.3'
## (as 'lib' is unspecified)
## also installing the dependency 'RcppEigen'
## package 'RcppEigen' successfully unpacked and MD5 sums checked
## Warning: unable to move temporary installation 'C:\Users\ro\Documents\R
## \win-library\3.3\file28305df72469\RcppEigen' to 'C:\Users\ro\Documents\R
## \win-library\3.3\RcppEigen'
## package 'lme4' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\ro\AppData\Local\Temp\RtmpUPfAlV\downloaded_packages
library(alr3)
## Warning: package 'alr3' was built under R version 3.3.3
## Loading required package: car
## Warning: package 'car' was built under R version 3.3.3
data(Mitchell)
# ?Mitchell
ggplot(aes(x=Month, y=Temp), data=Mitchell)+
geom_point()
si calculam corelatia:
with(Mitchell, cor.test(Month, Temp))
##
## Pearson's product-moment correlation
##
## data: Month and Temp
## t = 0.81816, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.08053637 0.19331562
## sample estimates:
## cor
## 0.05747063
si am putea spune ca nu… dar sa ne uitam mai ‘de aproape’ in R la date:
ggplot(aes(x=Month, y=Temp), data=Mitchell)+
geom_point()+
scale_x_continuous(breaks=seq(0,204,12))
si apoi daca il separam pe luni:
Mitchell$Mon <- Mitchell$Month %% 12
ggplot(aes(x=Month, y=Temp), data=Mitchell)+
geom_point()+
facet_wrap(~Mon, ncol=4)
Mitchell$Mon <- factor(Mitchell$Mon, labels=c('ianuarie', 'februarie', 'martie', 'aprilie',
'mai','iunie','iulie','august','septembrie','octombrie','noiembrie','decembrie'))
ggplot(aes(x=Month, y=Temp), data=Mitchell)+
geom_point()+
facet_wrap(~Mon, ncol=4)+
xlab('Luna')+
ylab('Temperatura')
ggplot(aes(x=friend_count), data=subset(pf, !is.na(gender)))+
geom_histogram()+
facet_wrap(~gender)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
apoi ne uitam si la distributia age/gender:
adaugam media la boxplot-uri
ggplot(aes(x=gender, y=age), data=subset(pf, !is.na(gender)))+
geom_boxplot()+
stat_summary(fun.y=mean, geom='point', shape=4)
ggplot(aes(x=age, y=friend_count), data=subset(pf, !is.na(gender)))+
geom_point()
ggplot(aes(x=age, y=friend_count), data=subset(pf, !is.na(gender)))+
geom_point(stat='summary', fun.y=median)
ggplot(aes(x=age, y=friend_count), data=subset(pf, !is.na(gender)))+
geom_line(stat='summary', fun.y=median)
-si in final sa vedem separat medianele pentru fiecare valoare a variabilei gender:
ggplot(aes(x=age, y=friend_count), data=subset(pf, !is.na(gender)))+
geom_line(stat='summary', fun.y=median, aes(color=gender))
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.3
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Vom separa datele pe age/gender :
age_gender <- group_by(pf, age, gender)
-apoi construim un tabel nou care sa contina statisticile pe care le dorim pentru fiecare grup in age_gender
pf.fc_si_age_si_gender <- summarise(age_gender,
media_fc=mean(friend_count),
mediana_fc=median(friend_count),
n=n())
head(pf.fc_si_age_si_gender, 15)
## Source: local data frame [15 x 5]
## Groups: age [7]
##
## age gender media_fc mediana_fc n
## <int> <fctr> <dbl> <dbl> <int>
## 1 13 female 259.1606 148.0 193
## 2 13 male 102.1340 55.0 291
## 3 14 female 362.4286 224.0 847
## 4 14 male 164.1456 92.5 1078
## 5 15 female 538.6813 276.0 1139
## 6 15 male 200.6658 106.5 1478
## 7 15 NA 116.0000 116.0 1
## 8 16 female 519.5145 258.5 1238
## 9 16 male 239.6748 136.0 1848
## 10 17 female 538.9943 245.5 1236
## 11 17 male 236.4924 125.0 2045
## 12 17 NA 106.5000 106.5 2
## 13 18 female 481.9794 243.0 2037
## 14 18 male 233.9183 122.0 3159
## 15 19 female 470.8007 229.0 1606
ggplot(aes(x=age, y=mediana_fc), data=pf.fc_si_age_si_gender)+
geom_line(aes(color=gender))
ggplot(aes(x=age, y=mediana_fc), data=subset(pf.fc_si_age_si_gender, !is.na(gender)))+
geom_line(aes(color=gender))
head(pf.fc_si_age_si_gender)
## Source: local data frame [6 x 5]
## Groups: age [3]
##
## age gender media_fc mediana_fc n
## <int> <fctr> <dbl> <dbl> <int>
## 1 13 female 259.1606 148.0 193
## 2 13 male 102.1340 55.0 291
## 3 14 female 362.4286 224.0 847
## 4 14 male 164.1456 92.5 1078
## 5 15 female 538.6813 276.0 1139
## 6 15 male 200.6658 106.5 1478
Avem nevoie de pachetul ‘reshape2’
install.packages('reshape2', repos = "http://cran.us.r-project.org")
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.3.3
Folosim functia dcast sa convertim tabelul pf.fc_si_age_si_gender:
pf.fc_si_age_si_genderLAT <- dcast(pf.fc_si_age_si_gender, age~gender, value.var = 'mediana_fc')
head(pf.fc_si_age_si_genderLAT, 15)
## age female male NA
## 1 13 148.0 55.0 NA
## 2 14 224.0 92.5 NA
## 3 15 276.0 106.5 116.0
## 4 16 258.5 136.0 NA
## 5 17 245.5 125.0 106.5
## 6 18 243.0 122.0 NA
## 7 19 229.0 130.0 374.0
## 8 20 190.0 112.0 65.0
## 9 21 158.0 108.0 79.0
## 10 22 124.0 97.0 NA
## 11 23 106.5 86.0 246.5
## 12 24 112.0 84.0 NA
## 13 25 78.0 55.0 15.0
## 14 26 82.0 70.0 65.0
## 15 27 81.0 68.0 7.0
age~gender: prima e variabila care ramane pivot in tabelul nou, iar a doua e cea care e impartita in mai multe coloane (in functie de cate valori ia)
value.var - variabila care ramane pastrata in coloanele noi create
ggplot(aes(x=age), data=pf.fc_si_age_si_genderLAT)+
geom_line( aes(y=male), color=I("#66C2A5"))+
geom_line( aes(y=female), color=I("#FC8D62"))
Reprezentam grafic raportul dintre numarul de femei si de barbati:
ggplot(aes(x=age, y=female/male), data=pf.fc_si_age_si_genderLAT)+
geom_line()
ggplot(aes(x=age, y=female/male), data=pf.fc_si_age_si_genderLAT)+
geom_line()+
geom_hline(yintercept=1, alpha=0.4, linetype=2, color=I('red'))
Vrem sa construim o variabila noua care sa pastreze anul in care s-a inscris o persoana pe facebook (stim tenure - nr de zile de cand e utilizator)
luam de referinta anul 2017:
pf$year_joined <- 2017-ceiling(pf$tenure/365)
? ceiling
## starting httpd help server ...
## done
head (pf, 15)
## userid age dob_day dob_year dob_month gender tenure friend_count
## 1 2094382 14 19 1999 11 male 266 0
## 2 1192601 14 2 1999 11 female 6 0
## 3 2083884 14 16 1999 11 male 13 0
## 4 1203168 14 25 1999 12 female 93 0
## 5 1733186 14 4 1999 12 male 82 0
## 6 1524765 14 1 1999 12 male 15 0
## 7 1136133 13 14 2000 1 male 12 0
## 8 1680361 13 4 2000 1 female 0 0
## 9 1365174 13 1 2000 1 male 81 0
## 10 1712567 13 2 2000 2 male 171 0
## 11 1612453 13 22 2000 2 male 98 0
## 12 2104073 13 1 2000 2 male 55 0
## 13 1918584 13 5 2000 3 male 106 0
## 14 1704433 13 21 2000 3 male 61 0
## 15 1932519 13 28 2000 3 female 0 0
## friendships_initiated likes likes_received mobile_likes
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## 7 0 0 0 0
## 8 0 0 0 0
## 9 0 0 0 0
## 10 0 0 0 0
## 11 0 0 0 0
## 12 0 0 0 0
## 13 0 0 0 0
## 14 0 0 0 0
## 15 0 0 0 0
## mobile_likes_received www_likes www_likes_received year_joined
## 1 0 0 0 2016
## 2 0 0 0 2016
## 3 0 0 0 2016
## 4 0 0 0 2016
## 5 0 0 0 2016
## 6 0 0 0 2016
## 7 0 0 0 2016
## 8 0 0 0 2017
## 9 0 0 0 2016
## 10 0 0 0 2016
## 11 0 0 0 2016
## 12 0 0 0 2016
## 13 0 0 0 2016
## 14 0 0 0 2016
## 15 0 0 0 2017
summary(pf$year_joined)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2008 2015 2015 2015 2016 2017 2
table(pf$year_joined)
##
## 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
## 9 15 581 1507 4557 5448 9860 33366 43588 70
qplot(pf$year_joined)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (stat_bin).
pf$year_joined_intervale <- cut(pf$year_joined, breaks=c(2008,2013,2014, 2016,2017))
table(pf$year_joined_intervale)
##
## (2008,2013] (2013,2014] (2014,2016] (2016,2017]
## 12108 9860 76954 70
ggplot(aes(x=age, y=friend_count), data=subset(pf, !is.na(year_joined_intervale)))+
geom_line(aes(color=year_joined_intervale), stat='summary', fun.y=median)
Incarcati tabelul diamonds in RStudio (fie il descarcati de pe site-ul cursului si il imprortati in R, fie il incarcati folosind data(diamonds), acesta fiind disponibil in pachetul ggplot2)
Cate observatii are tabelul?
Cate variabile de tip factor sunt in tabel? Cate din ele sunt factor ordonat?
Construiti o histograma cu preturile diamantelor.
Calculati media si mediana preturilor.
Cate diamante au pretul mai mic de 500$? Dar mai mic de 250? Dar mai mare sau egal cu 15000?
Personalizati histograma de la punctul 4 in 2 moduri (la alegere)
Impartiti histograma de mai sus in functie de variabila cut.
Care tip de diamant (cut) are:
cel mai mare pret?
cel mai mic pret?
cea mai mica valoare mediana?
Construiti boxplots pentru pret in functie de cut/clarity/color.
Reprezentati ca si nor de puncte perechile de variabile: price/depth, price/carat, depth/carat. Asezati cele trei grafice impreuna pe acelasi grafic.
Care din variabilele de mai sus sun cel mai puernic corelate? Colorati punctele de pe graficul corespunzator cu albastru.