# Load the yogurt purchase data and make the lesson folder the working
# directory. The CSV is read through its full path, so the working directory
# is changed only once (to the lesson folder) instead of hopping into the
# data folder and back out again just to read a single file.
course_dir <- "E:/Dropbox/FSEGA/cursuri/2016-2017/semestrul 2/R"
yo <- read.csv(file.path(course_dir, "date", "yogurt.csv"))
setwd(file.path(course_dir, "curs6"))
# Inspect the structure of the freshly loaded data frame.
str(yo)
## 'data.frame': 2380 obs. of 9 variables:
## $ obs : int 1 2 3 4 5 6 7 8 9 10 ...
## $ id : int 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 ...
## $ time : int 9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
## $ strawberry : int 0 0 0 0 1 1 0 0 0 0 ...
## $ blueberry : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pina.colada: int 0 0 0 0 1 2 0 0 0 0 ...
## $ plain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mixed.berry: int 1 1 1 1 1 1 1 1 1 1 ...
## $ price : num 59 59 65 65 49 ...
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.3
# List the column names of the yogurt data frame.
colnames(yo)
## [1] "obs" "id" "time" "strawberry" "blueberry"
## [6] "pina.colada" "plain" "mixed.berry" "price"
Convertim variabila ID in factor:
# Treat the household identifier as a categorical variable, then re-inspect
# the data frame to confirm the new column type.
yo[["id"]] <- as.factor(yo[["id"]])
str(object = yo)
## 'data.frame': 2380 obs. of 9 variables:
## $ obs : int 1 2 3 4 5 6 7 8 9 10 ...
## $ id : Factor w/ 332 levels "2100081","2100370",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ time : int 9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
## $ strawberry : int 0 0 0 0 1 1 0 0 0 0 ...
## $ blueberry : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pina.colada: int 0 0 0 0 1 2 0 0 0 0 ...
## $ plain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mixed.berry: int 1 1 1 1 1 1 1 1 1 1 ...
## $ price : num 59 59 65 65 49 ...
Histograma variabilei price:
# Histogram of the price column, one unit per bin.
ggplot(data = yo, mapping = aes(x = price)) +
  geom_histogram(fill = I('orange'), binwidth = 1)
# Five-number summary plus mean of the prices.
summary(yo[["price"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20.00 50.00 65.04 59.25 68.96 68.96
# The distinct price points, in order of first appearance.
unique(yo[["price"]])
## [1] 58.96 65.04 48.96 68.96 39.04 24.96 50.00 45.04 33.04 44.00 33.36
## [12] 55.04 62.00 20.00 49.60 49.52 33.28 63.04 33.20 33.52
# How many distinct price points there are.
length(unique(yo[["price"]]))
## [1] 20
# How many observations fall on each price point.
table(yo[["price"]])
##
## 20 24.96 33.04 33.2 33.28 33.36 33.52 39.04 44 45.04 48.96 49.52
## 2 11 54 1 1 22 1 234 21 11 81 1
## 49.6 50 55.04 58.96 62 63.04 65.04 68.96
## 1 205 6 303 15 2 799 609
Construim o variabila noua care contine numarul total de iaurturi cumparate la un moment dat de o anumita familie:
# Add all.purchases: the per-row total across the five flavour columns.
yo <- within(yo, {
  all.purchases <- strawberry + blueberry + pina.colada + plain + mixed.berry
})
sau putem scrie:
# Same total as a left-to-right sum over the flavour columns selected by name.
flavour_cols <- c("strawberry", "blueberry", "pina.colada", "plain", "mixed.berry")
yo$all.purchases <- Reduce(`+`, yo[flavour_cols])
verific:
# Sanity check of the new column: summary statistics of the per-row totals.
summary(yo[["all.purchases"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 1.971 2.000 21.000
si
# Histogram of the number of yogurts bought per purchase occasion.
# all.purchases is an integer count, so one bin per unit is used instead of
# the default 30 bins (the default made stat_bin() ask for a better binwidth
# and spreads integer counts over non-integer bin edges).
ggplot(data = yo, mapping = aes(x = all.purchases)) +
  geom_histogram(binwidth = 1)
# Scatter plot of price against time for all observations.
ggplot(data = yo, mapping = aes(x = time, y = price)) +
  geom_point()
Vrem sa verificam un numar de gospodarii alese aleator.
# Reproducibility demo: draw 16 household ids three times.
# 1) Seeded draw.
set.seed(seed = 4230)
sample.ids <- sample(x = levels(yo$id), size = 16)
print(sample.ids)
## [1] "2107953" "2123463" "2167320" "2127605" "2124750" "2133066" "2134676"
## [8] "2141341" "2107706" "2151829" "2119693" "2122705" "2115006" "2143271"
## [15] "2101980" "2101758"
# Number of distinct households available to draw from.
nlevels(yo$id)
## [1] 332
# 2) Drawing again without resetting the seed gives a different selection.
sample.ids <- sample(x = levels(yo$id), size = 16)
print(sample.ids)
## [1] "2143503" "2120378" "2133983" "2124511" "2139774" "2121095" "2165951"
## [8] "2147991" "2158196" "2108100" "2141341" "2143271" "2120436" "2102715"
## [15] "2107953" "2114025"
# 3) Resetting the same seed reproduces the first selection; this is the
#    value of sample.ids that remains in the workspace afterwards.
set.seed(seed = 4230)
sample.ids <- sample(x = levels(yo$id), size = 16)
print(sample.ids)
## [1] "2107953" "2123463" "2167320" "2127605" "2124750" "2133066" "2134676"
## [8] "2141341" "2107706" "2151829" "2119693" "2122705" "2115006" "2143271"
## [15] "2101980" "2101758"
# Usage reminder for sample(). It is kept as a comment because `x` and `size`
# are placeholders, not objects in this script, so evaluating the line fails:
#   sample(x, size, replace = FALSE)
# Ten draws from 1:4 with replacement.
sample(4, 10, replace = TRUE)
## [1] 3 2 3 2 3 2 4 4 4 1
# Four distinct draws from 1:100.
sample(100, 4, replace = FALSE)
## [1] 73 75 35 5
# Uniform random numbers; the seed makes the draws below reproducible.
set.seed(1)
runif(1)
## [1] 0.2655087
runif(10)
## [1] 0.37212390 0.57285336 0.90820779 0.20168193 0.89838968 0.94467527
## [7] 0.66079779 0.62911404 0.06178627 0.20597457
# Ten draws with min = 100 and max = 200.
runif(10, 100, 200)
## [1] 117.6557 168.7023 138.4104 176.9841 149.7699 171.7619 199.1906
## [8] 138.0035 177.7445 193.4705
# Histogram of 10000 uniform draws between 1 and 2. ggplot() is used instead
# of the deprecated qplot(), matching the other plots in this file.
a <- runif(10000, 1, 2)
ggplot(data.frame(a = a), aes(x = a)) +
  geom_histogram(bins = 100)
# Normal random numbers (default mean 0, sd 1).
rnorm(1)
## [1] -0.404868
rnorm(10)
## [1] 0.7371500 0.4858456 0.8293131 -0.1701218 1.3859215 0.8880190
## [7] 0.1819333 -0.5843820 1.7435776 -1.3925546
# Histogram of 10000 normal draws with mean 1 and sd 2, x axis clipped to
# [-8, 10].
a <- rnorm(10000, 1, 2)
ggplot(data.frame(a = a), aes(x = a)) +
  geom_histogram(bins = 50) +
  xlim(-8, 10)
## Warning: Removed 1 rows containing missing values (geom_bar).
# Exponential random numbers (default rate 1), then with rate 3.
rexp(2)
## [1] 0.7048225 1.8568433
rexp(2, rate = 3)
## [1] 0.1678928 0.3716671
# Histograms of 10000 exponential draws for rates 1, 10 and 0.5.
a <- rexp(10000)
ggplot(data.frame(a = a), aes(x = a)) +
  geom_histogram(bins = 50)
a <- rexp(10000, rate = 10)
ggplot(data.frame(a = a), aes(x = a)) +
  geom_histogram(bins = 100)
a <- rexp(10000, rate = 0.5)
ggplot(data.frame(a = a), aes(x = a)) +
  geom_histogram(bins = 100)
# Price over time for the sampled households only. The filtered rows and the
# shared time/price mapping are built once and reused by the four plots.
yo_sampled <- yo[yo$id %in% sample.ids, ]
price_over_time <- ggplot(data = yo_sampled, mapping = aes(x = time, y = price))

# 1) Hollow points sized by the number of yogurts bought.
price_over_time +
  geom_point(mapping = aes(size = all.purchases), shape = 1)

# 2) Same points, joined by a line, one panel per household.
price_over_time +
  geom_point(mapping = aes(size = all.purchases), shape = 1) +
  geom_line() +
  facet_wrap(~id)

# 3) As above, with the purchase count also mapped to colour.
price_over_time +
  geom_point(mapping = aes(size = all.purchases, color = all.purchases), shape = 1) +
  geom_line() +
  facet_wrap(~id)

# 4) Filled points with a light-blue to blue colour gradient.
price_over_time +
  geom_point(mapping = aes(size = all.purchases, color = all.purchases)) +
  geom_line() +
  facet_wrap(~id) +
  scale_color_gradient(low = 'lightblue', high = 'blue')
# Install GGally only when it is missing, so re-running the script does not
# download and reinstall the package every time; fetch it over https.
if (!requireNamespace("GGally", quietly = TRUE)) {
  install.packages("GGally", repos = "https://cloud.r-project.org")
}
# Transcript of the original session (only produced when the package actually
# has to be installed):
## Installing package into 'C:/Users/ro/Documents/R/win-library/3.3'
## (as 'lib' is unspecified)
## package 'GGally' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\ro\AppData\Local\Temp\RtmpkdCwgC\downloaded_packages
library(GGally)
## Warning: package 'GGally' was built under R version 3.3.3
# Seed the generator so the random row samples drawn below are reproducible.
set.seed(1834)
# Keep time plus four flavour columns. They are selected by name rather than
# by position (3:7) so the subset does not depend on the column order.
yo_subset <- yo[, c("time", "strawberry", "blueberry", "pina.colada", "plain")]
names(yo_subset)
## [1] "time" "strawberry" "blueberry" "pina.colada" "plain"
str(yo_subset)
## 'data.frame': 2380 obs. of 5 variables:
## $ time : int 9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
## $ strawberry : int 0 0 0 0 1 1 0 0 0 0 ...
## $ blueberry : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pina.colada: int 0 0 0 0 1 2 0 0 0 0 ...
## $ plain : int 0 0 0 0 0 0 0 0 0 0 ...
# Scatter-plot matrix of 50 randomly chosen rows. Each call draws its own
# fresh sample of rows.
ggpairs(yo_subset[sample.int(nrow(yo_subset), 50), ])
# Same kind of matrix with a smaller base font size.
ggpairs(yo_subset[sample.int(nrow(yo_subset), 50), ]) +
  theme_grey(base_size = 5)
# Install ggthemes only when it is missing, so re-running the script does not
# download and reinstall the package every time; fetch it over https.
if (!requireNamespace("ggthemes", quietly = TRUE)) {
  install.packages("ggthemes", repos = "https://cloud.r-project.org") # Install
}
# Transcript of the original session (only produced when the package actually
# has to be installed):
## Installing package into 'C:/Users/ro/Documents/R/win-library/3.3'
## (as 'lib' is unspecified)
## package 'ggthemes' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\ro\AppData\Local\Temp\RtmpkdCwgC\downloaded_packages
library(ggthemes) # Load
## Warning: package 'ggthemes' was built under R version 3.3.3
# Scatter-plot matrices of 50 randomly chosen rows, styled with two ggthemes
# themes. Each call draws its own fresh sample of rows.
ggpairs(yo_subset[sample.int(nrow(yo_subset), 50), ]) +
  theme_economist(base_size = 5)
ggpairs(yo_subset[sample.int(nrow(yo_subset), 50), ]) +
  theme_wsj(base_size = 5)
Informatii despre bacsisul primit la un restaurant:
valoarea in dolari
nota de plata
sexul persoanei care a platit
sunt fumatori in grup
ziua din saptamana
ora
marimea grupului
# Load the restaurant tips data set shipped with the reshape package and
# inspect its structure.
data("tips", package = "reshape")
str(object = tips)
## 'data.frame': 244 obs. of 7 variables:
## $ total_bill: num 17 10.3 21 23.7 24.6 ...
## $ tip : num 1.01 1.66 3.5 3.31 3.61 4.71 2 3.12 1.96 3.23 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 2 2 2 2 2 ...
## $ smoker : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ day : Factor w/ 4 levels "Fri","Sat","Sun",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ time : Factor w/ 2 levels "Dinner","Lunch": 1 1 1 1 1 1 1 1 1 1 ...
## $ size : int 2 3 3 2 4 4 2 4 2 2 ...

# Default plot matrix for the first three columns (total_bill, tip, sex).
ggpairs(tips, columns = 1:3, title = "tips data")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Density plots above the diagonal, faceted densities for the
# continuous/discrete pairs below it.
ggpairs(
  tips,
  columns = 1:3,
  title = "tips data",
  lower = list(combo = 'facetdensity'),
  upper = list(continuous = 'density')
)

# Scatter plots above the diagonal and bar-style plots on the diagonal.
ggpairs(
  tips,
  columns = 1:3,
  title = "tips data",
  diag = list(continuous = 'barDiag'),
  lower = list(combo = 'facetdensity'),
  upper = list(continuous = 'points')
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# First five columns, with custom plot types above and below the diagonal.
# NOTE(review): 'box' is documented by GGally as a `combo` plot type, not a
# `continuous` one -- confirm whether `upper = list(combo = 'box')` was meant.
ggpairs(
  tips,
  columns = 1:5,
  title = "tips data",
  lower = list(combo = 'dot', discrete = 'ratio'),
  upper = list(continuous = 'box')
)
# Price over time for the sampled households, coloured by the number of
# yogurts bought (yellow = few, red = many), drawn with the Economist theme.
ggplot(
  data = yo[yo$id %in% sample.ids, ],
  mapping = aes(x = time, y = price)
) +
  geom_point(mapping = aes(color = all.purchases)) +
  scale_color_gradient(low = 'yellow', high = 'red') +
  theme_economist()
Incarcati tabelul diamonds din ggplot2.
Creati un nor de puncte pentru price si x folosind ggplot.
Care e corelatia intre price si x?
Care este corelatia dintre price si y?
Care este corelatia dintre price si z?
Creati un nor de puncte a variabilelor price si depth.
Creati un nor de puncte pentru price si carat. Eliminati 1% din valorile cele mai mari pentru cele doua variabile.
Calculati o variabila noua, volum, ca produsul x * y * z.
Verificati corelatia dintre pret si volum. Dar daca eliminati diamantele cu volum 0 sau mai mare de 800?
Setati transparenta in graficul anterior.
Folositi pachetul dplyr pentru a crea un tabel nou cu informatii despre diamante grupate in functie de valorile clarity, in care sa apara pretul mediu, mediana preturilor, pretul minim, pretul maxim, si numarul lor.
Construiti o matrice de grafice care sa prezinte variabilele x, y, z, price, carat si clarity (folosind ggpairs).
Atribuiti o tema (la alegere) tuturor graficelor.