Pregatirea datelor:

# Read the yogurt purchase data by its full path instead of switching the
# working directory just to load one file.
data_dir <- "E:/Dropbox/FSEGA/cursuri/2016-2017/semestrul 2/R/date"
yo <- read.csv(file.path(data_dir, "yogurt.csv"))
# The session still ends up in the course folder, exactly as before.
setwd("E:/Dropbox/FSEGA/cursuri/2016-2017/semestrul 2/R/curs6")
str(yo)
## 'data.frame':    2380 obs. of  9 variables:
##  $ obs        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ id         : int  2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 ...
##  $ time       : int  9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
##  $ strawberry : int  0 0 0 0 1 1 0 0 0 0 ...
##  $ blueberry  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pina.colada: int  0 0 0 0 1 2 0 0 0 0 ...
##  $ plain      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mixed.berry: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ price      : num  59 59 65 65 49 ...
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.3
names(yo)
## [1] "obs"         "id"          "time"        "strawberry"  "blueberry"  
## [6] "pina.colada" "plain"       "mixed.berry" "price"

Convertim variabila ID in factor:

# Treat id as a categorical label rather than as a number.
yo[["id"]] <- factor(yo[["id"]])
str(yo)
## 'data.frame':    2380 obs. of  9 variables:
##  $ obs        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ id         : Factor w/ 332 levels "2100081","2100370",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ time       : int  9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
##  $ strawberry : int  0 0 0 0 1 1 0 0 0 0 ...
##  $ blueberry  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pina.colada: int  0 0 0 0 1 2 0 0 0 0 ...
##  $ plain      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mixed.berry: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ price      : num  59 59 65 65 49 ...

Histograma variabilei price:

# Histogram of price with one bin per unit of price, drawn in orange.
ggplot(yo, aes(x = price)) +
  geom_histogram(binwidth = 1, fill = "orange")

summary(yo$price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20.00   50.00   65.04   59.25   68.96   68.96
unique(yo$price)
##  [1] 58.96 65.04 48.96 68.96 39.04 24.96 50.00 45.04 33.04 44.00 33.36
## [12] 55.04 62.00 20.00 49.60 49.52 33.28 63.04 33.20 33.52
length(unique(yo$price))
## [1] 20
table(yo$price)
## 
##    20 24.96 33.04  33.2 33.28 33.36 33.52 39.04    44 45.04 48.96 49.52 
##     2    11    54     1     1    22     1   234    21    11    81     1 
##  49.6    50 55.04 58.96    62 63.04 65.04 68.96 
##     1   205     6   303    15     2   799   609

Construim o variabila noua care contine numarul de iaurturi cumparate la un moment dat de o anumita familie

# Total number of yogurts per row: the sum of the five flavour counts,
# stored in the new column all.purchases.
yo <- transform(
  yo,
  all.purchases = strawberry + blueberry + pina.colada + plain + mixed.berry
)

sau putem scrie:

# Same column, assigned with `$`; the sum is evaluated inside the data frame.
yo$all.purchases <- with(
  yo,
  strawberry + blueberry + pina.colada + plain + mixed.berry
)

verific:

summary(yo$all.purchases)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   2.000   1.971   2.000  21.000

si

# all.purchases is an integer count, so use one bin per value instead of the
# default 30 bins (which ggplot2 flags with a "Pick better value" message).
ggplot(yo, aes(x = all.purchases)) +
  geom_histogram(binwidth = 1)

# Scatter plot of price against time for every row of yo.
ggplot(yo, aes(x = time, y = price)) +
  geom_point()


Extragerea unui esantion

Vrem sa verificam un numar de gospodarii alese aleator.

# Fix the RNG seed so that the draw below is reproducible.
set.seed(4230)
# Draw 16 ids from the levels of the id factor.
sample.ids <- sample(levels(yo$id), 16)
sample.ids
##  [1] "2107953" "2123463" "2167320" "2127605" "2124750" "2133066" "2134676"
##  [8] "2141341" "2107706" "2151829" "2119693" "2122705" "2115006" "2143271"
## [15] "2101980" "2101758"
# Number of distinct ids available to draw from.
length(levels(yo$id))
## [1] 332
# Drawing again without resetting the seed gives a different sample.
sample.ids <- sample(levels(yo$id), 16)
sample.ids
##  [1] "2143503" "2120378" "2133983" "2124511" "2139774" "2121095" "2165951"
##  [8] "2147991" "2158196" "2108100" "2141341" "2143271" "2120436" "2102715"
## [15] "2107953" "2114025"
# Resetting the same seed reproduces the first sample exactly; this is the
# sample.ids value used by the later plots.
set.seed(4230)
sample.ids <- sample(levels(yo$id), 16)
sample.ids
##  [1] "2107953" "2123463" "2167320" "2127605" "2124750" "2133066" "2134676"
##  [8] "2141341" "2107706" "2151829" "2119693" "2122705" "2115006" "2143271"
## [15] "2101980" "2101758"

Sintaxa functiei sample: sample(x, size, replace = FALSE)

# A single number n as first argument means "sample from 1:n". Drawing 10
# values from only 4 candidates requires replace=TRUE.
sample(4,10, replace=TRUE)
##  [1] 3 2 3 2 3 2 4 4 4 1
# Four distinct values from 1:100.
sample(100, 4, replace=FALSE)
## [1] 73 75 35  5

Generarea de numere aleatoare in R

# Seed the generator so the numbers printed below are reproducible.
set.seed(1)

# Uniform distribution: the default range is [0, 1].
runif(1)
## [1] 0.2655087
runif(10)
##  [1] 0.37212390 0.57285336 0.90820779 0.20168193 0.89838968 0.94467527
##  [7] 0.66079779 0.62911404 0.06178627 0.20597457
# Ten uniform values between 100 and 200.
runif(10, min = 100, max = 200)
##  [1] 117.6557 168.7023 138.4104 176.9841 149.7699 171.7619 199.1906
##  [8] 138.0035 177.7445 193.4705
# Histogram of 10000 uniform values between 1 and 2.
a <- runif(10000, min = 1, max = 2)
qplot(a, bins = 100)

# Normal distribution: the defaults are mean 0 and standard deviation 1.
rnorm(1)
## [1] -0.404868
rnorm(10)
##  [1]  0.7371500  0.4858456  0.8293131 -0.1701218  1.3859215  0.8880190
##  [7]  0.1819333 -0.5843820  1.7435776 -1.3925546
# Histogram of 10000 normal values with mean 1 and standard deviation 2,
# with the x axis restricted to [-8, 10].
a <- rnorm(10000, mean = 1, sd = 2)
qplot(a, bins = 50) + xlim(-8, 10)
## Warning: Removed 1 rows containing missing values (geom_bar).

# Exponential distribution: the default rate is 1.
rexp(2)
## [1] 0.7048225 1.8568433
rexp(2, rate = 3)
## [1] 0.1678928 0.3716671
a <- rexp(10000)
qplot(a, bins = 50)

# Same histogram for a larger rate ...
a <- rexp(10000, rate = 10)
qplot(a, bins = 100)

# ... and for a smaller rate.
a <- rexp(10000, rate = 0.5)
qplot(a, bins = 100)


# Rows of the sampled ids only, and the price-over-time base plot that all
# four charts below share.
yo_sample <- subset(yo, id %in% sample.ids)
price_over_time <- ggplot(yo_sample, aes(x = time, y = price))

# Hollow circles (pch = 1) sized by the number of yogurts bought.
price_over_time +
  geom_point(aes(size = all.purchases), pch = 1)

# Same points joined by a line, one panel per id.
price_over_time +
  geom_point(aes(size = all.purchases), pch = 1) +
  geom_line() +
  facet_wrap(~id)

# The number of yogurts bought now drives both size and colour.
price_over_time +
  geom_point(aes(size = all.purchases, color = all.purchases), pch = 1) +
  geom_line() +
  facet_wrap(~id)

# Default point shape, with a light-blue to blue colour gradient.
price_over_time +
  geom_point(aes(size = all.purchases, color = all.purchases)) +
  geom_line() +
  facet_wrap(~id) +
  scale_color_gradient(low = "lightblue", high = "blue")

In final: mai multe variabile :)

# Install GGally only when it is missing, so re-running the script does not
# download and reinstall the package every time.
if (!requireNamespace("GGally", quietly = TRUE)) {
  install.packages("GGally",  repos = "http://cran.us.r-project.org")
}
## Installing package into 'C:/Users/ro/Documents/R/win-library/3.3'
## (as 'lib' is unspecified)
## package 'GGally' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\ro\AppData\Local\Temp\RtmpkdCwgC\downloaded_packages
library(GGally)
## Warning: package 'GGally' was built under R version 3.3.3
# Seed the RNG before the random row samples drawn for the plots below.
set.seed(1834)
# Select the columns by name rather than by position (3:7), so the subset
# does not silently change if the column order of the data ever does.
yo_subset <- yo[, c("time", "strawberry", "blueberry", "pina.colada", "plain")]
names(yo_subset)
## [1] "time"        "strawberry"  "blueberry"   "pina.colada" "plain"
str(yo_subset)
## 'data.frame':    2380 obs. of  5 variables:
##  $ time       : int  9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
##  $ strawberry : int  0 0 0 0 1 1 0 0 0 0 ...
##  $ blueberry  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pina.colada: int  0 0 0 0 1 2 0 0 0 0 ...
##  $ plain      : int  0 0 0 0 0 0 0 0 0 0 ...
# Pairwise plot matrix for a random sample of 50 rows of yo_subset.
ggpairs(yo_subset[sample.int(nrow(yo_subset),50),])

# sample.int() is called again here, so this plot uses a new sample of 50
# rows; the base font size is reduced to 5.
ggpairs(yo_subset[sample.int(nrow(yo_subset),50),])+
  theme_grey(base_size = 5)

# Install ggthemes only when it is missing, so re-running the script does not
# download and reinstall the package every time.
if (!requireNamespace("ggthemes", quietly = TRUE)) {
  install.packages("ggthemes",  repos = "http://cran.us.r-project.org") # Install
}
## Installing package into 'C:/Users/ro/Documents/R/win-library/3.3'
## (as 'lib' is unspecified)
## package 'ggthemes' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\ro\AppData\Local\Temp\RtmpkdCwgC\downloaded_packages
library(ggthemes) # Load
## Warning: package 'ggthemes' was built under R version 3.3.3
# Same kind of plot matrix on a new random sample of 50 rows, styled with
# theme_economist.
ggpairs(yo_subset[sample.int(nrow(yo_subset),50),])+
  theme_economist(base_size = 5)

# And once more, on another new sample, styled with theme_wsj.
ggpairs(yo_subset[sample.int(nrow(yo_subset),50),])+
  theme_wsj(base_size = 5)


Alt exemplu

Informatii despre bacsis-ul primit la un restaurant:

data(tips, package='reshape')
str(tips)
## 'data.frame':    244 obs. of  7 variables:
##  $ total_bill: num  17 10.3 21 23.7 24.6 ...
##  $ tip       : num  1.01 1.66 3.5 3.31 3.61 4.71 2 3.12 1.96 3.23 ...
##  $ sex       : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 2 2 2 2 2 ...
##  $ smoker    : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ day       : Factor w/ 4 levels "Fri","Sat","Sun",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ time      : Factor w/ 2 levels "Dinner","Lunch": 1 1 1 1 1 1 1 1 1 1 ...
##  $ size      : int  2 3 3 2 4 4 2 4 2 2 ...
# Plot matrix of the first three columns of tips (total_bill, tip, sex).
ggpairs(tips, columns = 1:3, title = "tips data")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same three columns: the "density" panel for continuous pairs above the
# diagonal and "facetdensity" for mixed (combo) pairs below it.
ggpairs(
  tips,
  columns = 1:3,
  title = "tips data",
  upper = list(continuous = "density"),
  lower = list(combo = "facetdensity")
)

# Same three columns: "points" above the diagonal, "facetdensity" below it
# for mixed pairs, and "barDiag" on the diagonal for continuous variables.
ggpairs(
  tips,
  columns = 1:3,
  title = "tips data",
  upper = list(continuous = "points"),
  lower = list(combo = "facetdensity"),
  diag = list(continuous = "barDiag")
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# First five columns of tips (total_bill, tip, sex, smoker, day).
# NOTE(review): "box" is requested for continuous pairs in the upper
# triangle; GGally documents it as a combo panel type -- confirm this is
# intended.
ggpairs(
  tips,
  columns = 1:5,
  title = "tips data",
  upper = list(continuous = "box"),
  lower = list(combo = "dot", discrete = "ratio")
)


# Price over time for the sampled ids, coloured from yellow to red by the
# number of yogurts bought, using the theme_economist theme.
ggplot(subset(yo, id %in% sample.ids), aes(x = time, y = price)) +
  geom_point(aes(color = all.purchases)) +
  scale_color_gradient(low = "yellow", high = "red") +
  theme_economist()

Exercitii

  1. Incarcati tabelul diamonds din ggplot2.

  2. Creati un nor de puncte pentru price si x folosind ggplot.

  3. Care e corelatia intre price si x?

  4. Care este corelatia dintre price si y?

  5. Care este corelatia dintre price si z?

  6. Creati un nor de puncte pentru variabilele price si depth.

  7. Creati un nor de puncte pentru price si carat. Eliminati 1% din valorile cele mai mari pentru cele doua variabile.

  8. Calculati o variabila noua, volum, ca produsul x * y * z.

  9. Verificati corelatia dintre pret si volum. Dar daca eliminati diamantele cu volum 0 sau mai mare de 800?

  10. Setati transparenta in graficul anterior.

  11. Folositi pachetul dplyr pentru a crea un tabel nou cu informatii despre diamante grupate in functie de valorile clarity, in care sa apara pretul mediu, mediana preturilor, pretul minim, pretul maxim, si numarul lor.

  12. Construiti o matrice de grafice care sa prezinte variabilele x, y, z, price, carat si clarity (folosind ggpairs).

  13. Atribuiti o tema (la alegere) tuturor graficelor.