Incarcati tabelul pseudo_facebook.tsv
str(pf)
'data.frame': 99003 obs. of 15 variables:
$ userid : int 2094382 1192601 2083884 1203168 1733186 1524765 1136133 1680361 1365174 1712567 ...
$ age : int 14 14 14 14 14 14 13 13 13 13 ...
$ dob_day : int 19 2 16 25 4 1 14 4 1 2 ...
$ dob_year : int 1999 1999 1999 1999 1999 1999 2000 2000 2000 2000 ...
$ dob_month : int 11 11 11 12 12 12 1 1 1 2 ...
$ gender : Factor w/ 2 levels "female","male": 2 1 2 1 2 2 2 1 2 2 ...
$ tenure : int 266 6 13 93 82 15 12 0 81 171 ...
$ friend_count : int 0 0 0 0 0 0 0 0 0 0 ...
$ friendships_initiated: int 0 0 0 0 0 0 0 0 0 0 ...
$ likes : int 0 0 0 0 0 0 0 0 0 0 ...
$ likes_received : int 0 0 0 0 0 0 0 0 0 0 ...
$ mobile_likes : int 0 0 0 0 0 0 0 0 0 0 ...
$ mobile_likes_received: int 0 0 0 0 0 0 0 0 0 0 ...
$ www_likes : int 0 0 0 0 0 0 0 0 0 0 ...
$ www_likes_received : int 0 0 0 0 0 0 0 0 0 0 ...
cor.test(pf$age, pf$friend_count)
Pearson's product-moment correlation
data: pf$age and pf$friend_count
t = -8.6268, df = 99001, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.03363072 -0.02118189
sample estimates:
cor
-0.02740737
?cor.test
qplot(pf$age, pf$friend_count)
# Pearson correlation test between tenure and friend count; the result is
# stored in `r` so that its components can be inspected afterwards.
r <- cor.test(x = pf$tenure, y = pf$friend_count)
print(r)
Pearson's product-moment correlation
data: pf$tenure and pf$friend_count
t = 53.049, df = 98999, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.1601927 0.1723067
sample estimates:
cor
0.166256
r$estimate
cor
0.166256
r$p.value
[1] 0
data(iris)
str(iris)
'data.frame': 150 obs. of 5 variables:
$ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
$ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
$ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
$ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
$ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
pairs(iris)
pairs(iris[1:4])
library(ggplot2)
# Scatter plot of petal length against sepal length, one colour per species.
ggplot(data = iris, mapping = aes(x = Sepal.Length, y = Petal.Length)) +
  geom_point(mapping = aes(colour = Species))
# Install GGally only when it is missing, so re-running the notebook does not
# download and reinstall the package every time; then attach it.
if (!requireNamespace("GGally", quietly = TRUE)) {
  install.packages("GGally")
}
library(GGally)
package ‘GGally’ was built under R version 3.5.3
Attaching package: ‘GGally’
The following object is masked from ‘package:dplyr’:
nasa
ggpairs(iris)
cor.test(iris[,1], iris[,3])
Pearson's product-moment correlation
data: iris[, 1] and iris[, 3]
t = 21.646, df = 148, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.8270363 0.9055080
sample estimates:
cor
0.8717538
# Scatter-plot matrix restricted to the four numeric measurements.
ggpairs(iris[1:4])
# Same matrix coloured by species. The column selection is an argument of
# ggpairs() itself (`columns`), not an aesthetic, so it sits outside aes().
ggpairs(data = iris, mapping = aes(color = Species), columns = 1:4)
with(iris, cor.test(Sepal.Length, Sepal.Width))
Pearson's product-moment correlation
data: Sepal.Length and Sepal.Width
t = -1.4403, df = 148, p-value = 0.1519
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.27269325 0.04351158
sample estimates:
cor
-0.1175698
cor.test(iris$Sepal.Length, iris$Sepal.Width)
Pearson's product-moment correlation
data: iris$Sepal.Length and iris$Sepal.Width
t = -1.4403, df = 148, p-value = 0.1519
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.27269325 0.04351158
sample estimates:
cor
-0.1175698
# Correlation between sepal length and sepal width for versicolor rows only.
with(
  iris[iris$Species == "versicolor", ],
  cor.test(Sepal.Length, Sepal.Width)
)
Pearson's product-moment correlation
data: Sepal.Length and Sepal.Width
t = 4.2839, df = 48, p-value = 8.772e-05
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.2900175 0.7015599
sample estimates:
cor
0.5259107
with(pf, cor.test(likes_received, www_likes_received))
Pearson's product-moment correlation
data: likes_received and www_likes_received
t = 937.1, df = 99001, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.9473553 0.9486176
sample estimates:
cor
0.9479902
# Total likes received against likes received via www, with both axes
# limited to the range from 0 to the 99th percentile of the variable.
ggplot(data = pf, mapping = aes(x = likes_received, y = www_likes_received)) +
  geom_point() +
  coord_cartesian(
    xlim = c(0, quantile(pf$likes_received, probs = 0.99)),
    ylim = c(0, quantile(pf$www_likes_received, probs = 0.99))
  )
Instalati pachetul alr3, apoi incarcati-l:
library(alr3)
package ‘alr3’ was built under R version 3.5.3
Loading required package: car
package ‘car’ was built under R version 3.5.3
Loading required package: carData
Attaching package: ‘car’
The following object is masked from ‘package:dplyr’:
recode
data("Mitchell")
str(Mitchell)
'data.frame': 204 obs. of 2 variables:
$ Month: int 0 1 2 3 4 5 6 7 8 9 ...
$ Temp : num -5.18 -1.65 2.49 10.4 14.99 ...
?Mitchell
# Temperature plotted against the month index of the Mitchell data.
ggplot(Mitchell, aes(Month, Temp)) +
  geom_point()
with(Mitchell, cor.test(Month, Temp))
Pearson's product-moment correlation
data: Month and Temp
t = 0.81816, df = 202, p-value = 0.4142
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.08053637 0.19331562
sample estimates:
cor
0.05747063
# Month index modulo 12 gives a 0-11 position within a 12-month cycle;
# the table counts how many observations fall on each position.
Mitchell$Luna <- Mitchell$Month %% 12
table(Mitchell$Luna)
0 1 2 3 4 5 6 7 8 9 10 11
17 17 17 17 17 17 17 17 17 17 17 17
# One panel per 0-11 month position.
ggplot(Mitchell, aes(Month, Temp)) +
  geom_point() +
  facet_wrap(~Luna)

# Convert the 0-11 position into a factor labelled with Romanian month
# abbreviations (labels are assigned to the sorted levels, 0 first).
Mitchell$Luna <- factor(
  Mitchell$Luna,
  labels = c(
    'ian', 'feb', 'mar', 'apr', 'mai', 'iun',
    'iul', 'aug', 'sep', 'oct', 'nov', 'dec'
  )
)

# Single panel again, with points coloured by month label.
ggplot(Mitchell, aes(Month, Temp)) +
  geom_point(mapping = aes(colour = Luna))
library(dplyr)
pf_varsta_sex <- group_by(pf, age, gender)
Factor `gender` contains implicit NA, consider using `forcats::fct_explicit_na`
# Mean and median friend count, plus group size, for every age/gender group.
pf_varsta_sex_statistici <- summarise(
  pf_varsta_sex,
  mediaFC = mean(friend_count),
  medianaFC = median(friend_count),
  n = n()
)
head(pf_varsta_sex_statistici)

# Median friend count by age, one line per gender; rows whose gender is
# missing are excluded from the plot.
ggplot(
  data = subset(pf_varsta_sex_statistici, !is.na(gender)),
  mapping = aes(x = age, y = medianaFC)
) +
  geom_line(mapping = aes(colour = gender))
library(reshape2)
package ‘reshape2’ was built under R version 3.5.3
head(pf_varsta_sex_statistici)
# Reshape to wide form: one row per age and one column per gender value,
# each cell holding the median friend count.
pf_Nou <- dcast(
  data = pf_varsta_sex_statistici,
  formula = age ~ gender,
  value.var = 'medianaFC'
)
head(pf_Nou)
# Median friend count by age for each gender. The line colours are fixed
# values, so they are set as layer parameters instead of being mapped in aes().
ggplot(data = pf_Nou, aes(x = age)) +
  geom_line(aes(y = male), color = "green") +
  geom_line(aes(y = female), color = "blue")

# Female-to-male ratio of the medians; the red reference line at 1 marks
# the level where both medians are equal.
ggplot(data = pf_Nou, aes(x = age, y = female / male)) +
  geom_line() +
  geom_hline(yintercept = 1, linetype = 4, color = "red")
# Derive a year for each user: tenure divided by 365 (presumably tenure is
# in days - confirm), rounded up to whole years and subtracted from 2019.
pf[["an"]] <- 2019 - ceiling(pf[["tenure"]] / 365)
help("ceiling")
table(pf[["an"]])
2010 2011 2012 2013 2014 2015 2016 2017 2018 2019
9 15 581 1507 4557 5448 9860 33366 43588 70
qplot(pf$an)
# Bin the year into four periods. cut() builds right-closed intervals, so
# without include.lowest = TRUE the smallest break (2010) would belong to no
# bin and every row with an == 2010 would silently become NA. With it, the
# first interval is [2010, 2013] and those rows are counted there.
pf$an_intervale <- cut(
  pf$an,
  breaks = c(2010, 2013, 2015, 2017, 2019),
  include.lowest = TRUE
)
table(pf$an_intervale)
(2010,2013] (2013,2015] (2015,2017] (2017,2019]
2103 10005 43226 43658
head(pf)
str(pf)
'data.frame': 99003 obs. of 17 variables:
$ userid : int 2094382 1192601 2083884 1203168 1733186 1524765 1136133 1680361 1365174 1712567 ...
$ age : int 14 14 14 14 14 14 13 13 13 13 ...
$ dob_day : int 19 2 16 25 4 1 14 4 1 2 ...
$ dob_year : int 1999 1999 1999 1999 1999 1999 2000 2000 2000 2000 ...
$ dob_month : int 11 11 11 12 12 12 1 1 1 2 ...
$ gender : Factor w/ 2 levels "female","male": 2 1 2 1 2 2 2 1 2 2 ...
$ tenure : int 266 6 13 93 82 15 12 0 81 171 ...
$ friend_count : int 0 0 0 0 0 0 0 0 0 0 ...
$ friendships_initiated: int 0 0 0 0 0 0 0 0 0 0 ...
$ likes : int 0 0 0 0 0 0 0 0 0 0 ...
$ likes_received : int 0 0 0 0 0 0 0 0 0 0 ...
$ mobile_likes : int 0 0 0 0 0 0 0 0 0 0 ...
$ mobile_likes_received: int 0 0 0 0 0 0 0 0 0 0 ...
$ www_likes : int 0 0 0 0 0 0 0 0 0 0 ...
$ www_likes_received : int 0 0 0 0 0 0 0 0 0 0 ...
$ an : num 2018 2018 2018 2018 2018 ...
$ an_intervale : Factor w/ 4 levels "(2010,2013]",..: 4 4 4 4 4 4 4 4 4 4 ...
# Median friend count by age, one line per year interval; the intervals are
# distinguished by both colour and line type.
ggplot(pf, aes(age, friend_count)) +
  geom_line(
    mapping = aes(colour = an_intervale, linetype = an_intervale),
    stat = 'summary',
    fun.y = median
  )
Incarcati tabelul diamonds in RStudio (fie il descarcati de pe site-ul cursului si il importati in R, fie il incarcati folosind data(diamonds), acesta fiind disponibil in pachetul ggplot2).
data("diamonds")
Cate observatii are tabelul?
str(diamonds)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 53940 obs. of 10 variables:
$ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
$ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
$ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
$ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
$ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
$ table : num 55 61 65 58 58 57 57 55 61 61 ...
$ price : int 326 326 327 334 335 336 336 337 337 338 ...
$ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
$ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
$ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
?diamonds
Cate variabile de tip factor sunt in tabel? Cate din ele sunt factor ordonat?
Construiti o histograma cu preturile diamantelor.
qplot(diamonds$price)
Calculati media si mediana preturilor.
mean(diamonds$price)
[1] 3932.8
median(diamonds$price)
[1] 2401
summary(diamonds$price)
Min. 1st Qu. Median Mean 3rd Qu. Max.
326 950 2401 3933 5324 18823
Cate diamante au pretul mai mic de 500$?
# Logical flag per diamond: TRUE when the price is below 500; the table
# counts how many diamonds fall on each side of the threshold.
cate <- diamonds$price < 500
table(cate)
cate
FALSE TRUE
52211 1729
Dar mai mic de 250?
table(diamonds$price<250)
FALSE
53940
Dar mai mare sau egal cu 15000?
table(diamonds$price>=15000)
FALSE TRUE
52284 1656
Personalizati histograma de la punctul 4 in 2 moduri (la alegere)
qplot(diamonds$price, main='titlu', color=I('coral'), fill=I('coral'))
Impartiti histograma de mai sus in functie de variabila cut.
qplot(data=diamonds, x=price, main='titlu', color=I('coral'), fill=I('coral'))+
facet_wrap(~cut)
table(diamonds$cut)
Fair Good Very Good Premium Ideal
1610 4906 12082 13791 21551
Care tip de diamant (cut) are:
cel mai mare pret?
cel mai mic pret?
cea mai mica valoare mediana?
by(diamonds$price, diamonds$cut, summary)
diamonds$cut: Fair
Min. 1st Qu. Median Mean 3rd Qu. Max.
337 2050 3282 4359 5206 18574
---------------------------------------------------
diamonds$cut: Good
Min. 1st Qu. Median Mean 3rd Qu. Max.
327 1145 3050 3929 5028 18788
---------------------------------------------------
diamonds$cut: Very Good
Min. 1st Qu. Median Mean 3rd Qu. Max.
336 912 2648 3982 5373 18818
---------------------------------------------------
diamonds$cut: Premium
Min. 1st Qu. Median Mean 3rd Qu. Max.
326 1046 3185 4584 6296 18823
---------------------------------------------------
diamonds$cut: Ideal
Min. 1st Qu. Median Mean 3rd Qu. Max.
326 878 1810 3458 4678 18806
with(diamonds, by(price, cut, summary))
cut: Fair
Min. 1st Qu. Median Mean 3rd Qu. Max.
337 2050 3282 4359 5206 18574
---------------------------------------------------
cut: Good
Min. 1st Qu. Median Mean 3rd Qu. Max.
327 1145 3050 3929 5028 18788
---------------------------------------------------
cut: Very Good
Min. 1st Qu. Median Mean 3rd Qu. Max.
336 912 2648 3982 5373 18818
---------------------------------------------------
cut: Premium
Min. 1st Qu. Median Mean 3rd Qu. Max.
326 1046 3185 4584 6296 18823
---------------------------------------------------
cut: Ideal
Min. 1st Qu. Median Mean 3rd Qu. Max.
326 878 1810 3458 4678 18806
# Highest price within each cut, followed by the overall maximum across cuts.
x <- by(diamonds$price, INDICES = diamonds$cut, FUN = max)
max(x)
[1] 18823
Construiti boxplots pentru pret in functie de cut/clarity/color.