Curs 6

Pregatirea datelor:

# Read the yogurt purchase data by its full path instead of switching the
# working directory to the data folder and back again.
data_dir <- "E:/Dropbox/FSEGA/cursuri/2016-2017/semestrul 2/R/date"
yo <- read.csv(file.path(data_dir, "yogurt.csv"))
# The session still ends up in the lecture folder, as it did before.
setwd("E:/Dropbox/FSEGA/cursuri/2016-2017/semestrul 2/R/curs6")
# Show the column types and the first values of each column.
str(yo)

## 'data.frame':    2380 obs. of  9 variables:
##  $ obs        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ id         : int  2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 ...
##  $ time       : int  9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
##  $ strawberry : int  0 0 0 0 1 1 0 0 0 0 ...
##  $ blueberry  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pina.colada: int  0 0 0 0 1 2 0 0 0 0 ...
##  $ plain      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mixed.berry: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ price      : num  59 59 65 65 49 ...

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.3.3

# List the column names of the yogurt data frame.
names(yo)

## [1] "obs"         "id"          "time"        "strawberry"  "blueberry"  
## [6] "pina.colada" "plain"       "mixed.berry" "price"

Convertim variabila ID in factor:

# Household identifiers are labels, not quantities: store them as a factor.
yo[["id"]] <- as.factor(yo[["id"]])
# Check the structure again to confirm the new type of `id`.
str(yo)

## 'data.frame':    2380 obs. of  9 variables:
##  $ obs        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ id         : Factor w/ 332 levels "2100081","2100370",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ time       : int  9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
##  $ strawberry : int  0 0 0 0 1 1 0 0 0 0 ...
##  $ blueberry  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pina.colada: int  0 0 0 0 1 2 0 0 0 0 ...
##  $ plain      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mixed.berry: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ price      : num  59 59 65 65 49 ...

Histograma variabilei price:

# Histogram of price with bins one price unit wide, filled in orange.
ggplot(data = yo, mapping = aes(x = price)) +
  geom_histogram(fill = "orange", binwidth = 1)

statistici:

# Five-number summary plus the mean of the price column.
summary(yo$price)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20.00   50.00   65.04   59.25   68.96   68.96

valori unice pentru pret:

# Distinct price values, in order of first appearance.
unique(yo$price)

##  [1] 58.96 65.04 48.96 68.96 39.04 24.96 50.00 45.04 33.04 44.00 33.36
## [12] 55.04 62.00 20.00 49.60 49.52 33.28 63.04 33.20 33.52

cate valori unice sunt?

# Number of distinct price values.
length(unique(yo$price))

## [1] 20

si frecventa lor care e?

# Frequency of each distinct price value.
table(yo$price)

## 
##    20 24.96 33.04  33.2 33.28 33.36 33.52 39.04    44 45.04 48.96 49.52 
##     2    11    54     1     1    22     1   234    21    11    81     1 
##  49.6    50 55.04 58.96    62 63.04 65.04 68.96 
##     1   205     6   303    15     2   799   609

Construim o variabila noua care contine numarul de iaurturi cumparate la un moment dat de o anumita familie

# Add all.purchases: the total number of yogurts in each row, summed over
# the five flavour columns.
yo <- transform(
  yo,
  all.purchases = strawberry + blueberry + pina.colada + plain + mixed.berry
)

sau putem scrie:

# Same total as above, assigned directly to the column; with() lets the
# flavour columns be referenced without repeating the `yo$` prefix.
yo$all.purchases <- with(
  yo,
  strawberry + blueberry + pina.colada + plain + mixed.berry
)

verific:

# Sanity check of the new column: summary statistics of items per purchase.
summary(yo$all.purchases)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   2.000   1.971   2.000  21.000

# Histogram of the number of items per purchase.
# all.purchases is an integer count, so use one bin per count value. The
# default of 30 bins does not line up with the integers and makes stat_bin()
# emit its "Pick better value with `binwidth`" message.
ggplot(aes(x = all.purchases), data = yo) +
  geom_histogram(binwidth = 1)

nor de puncte:

# Scatter plot of price against time for every row of the data set.
ggplot(data = yo, mapping = aes(x = time, y = price)) +
  geom_point()

Extragerea unui esantion

Vrem sa verificam un numar de gospodarii alese aleator.

Pentru a obtine aceleasi rezultate la fiecare rulare, setam radacina (seed-ul) generatorului de numere aleatoare:

# Fix the random seed so that the draw below is reproducible.
set.seed(4230)

# Draw 16 household ids at random from the factor levels of yo$id.
sample.ids <- sample(levels(yo$id), 16)

sample.ids

##  [1] "2107953" "2123463" "2167320" "2127605" "2124750" "2133066" "2134676"
##  [8] "2141341" "2107706" "2151829" "2119693" "2122705" "2115006" "2143271"
## [15] "2101980" "2101758"

# Total number of distinct households the sample was drawn from.
length(levels(yo$id))

## [1] 332

# Drawing again WITHOUT resetting the seed gives a different sample
# (compare the output below with the first draw).
sample.ids <- sample(levels(yo$id), 16)

sample.ids

##  [1] "2143503" "2120378" "2133983" "2124511" "2139774" "2121095" "2165951"
##  [8] "2147991" "2158196" "2108100" "2141341" "2143271" "2120436" "2102715"
## [15] "2107953" "2114025"

# Resetting the same seed reproduces the first sample exactly; this is the
# value of sample.ids that the later plots use.
set.seed(4230)

sample.ids <- sample(levels(yo$id), 16)

sample.ids

##  [1] "2107953" "2123463" "2167320" "2127605" "2124750" "2133066" "2134676"
##  [8] "2141341" "2107706" "2151829" "2119693" "2122705" "2115006" "2143271"
## [15] "2101980" "2101758"

sample(x, size, replace = FALSE)

x - un vector din care se alege sau un numar intreg pozitiv n (caz in care se alege din 1:n)
size - marimea esantionului
replace - extragere cu revenire (TRUE) sau fara revenire (FALSE)

# Ten draws with replacement from the integers 1 to 4.
sample(x = 4, size = 10, replace = TRUE)

##  [1] 3 2 3 2 3 2 4 4 4 1

# Four draws without replacement (the default) from the integers 1 to 100.
sample(x = 100, size = 4)

## [1] 73 75 35  5

Generarea de numere aleatoare in R

Distributia uniforma:

# Fix the seed so the uniform draws below are reproducible.
set.seed(1)
# One draw from the default uniform distribution on [0, 1].
runif(n = 1)

## [1] 0.2655087

# Ten more draws from the same default distribution.
runif(n = 10)

##  [1] 0.37212390 0.57285336 0.90820779 0.20168193 0.89838968 0.94467527
##  [7] 0.66079779 0.62911404 0.06178627 0.20597457

# Ten draws from the uniform distribution on [100, 200].
runif(n = 10, min = 100, max = 200)

##  [1] 117.6557 168.7023 138.4104 176.9841 149.7699 171.7619 199.1906
##  [8] 138.0035 177.7445 193.4705

# 10,000 draws from the uniform distribution on [1, 2], shown as a 100-bin
# histogram. Uses `<-` for assignment and an explicit ggplot() call in place
# of qplot(), which is deprecated in current ggplot2 releases.
a <- runif(10000, 1, 2)
ggplot(data.frame(a = a), aes(x = a)) +
  geom_histogram(bins = 100)

Distributia normala

# One draw from the standard normal distribution (mean 0, sd 1).
rnorm(n = 1)

## [1] -0.404868

# Ten more draws from the standard normal distribution.
rnorm(n = 10)

##  [1]  0.7371500  0.4858456  0.8293131 -0.1701218  1.3859215  0.8880190
##  [7]  0.1819333 -0.5843820  1.7435776 -1.3925546

# 10,000 draws from a normal distribution with mean 1 and standard deviation 2.
a <- rnorm(10000, 1, 2)

# 50-bin histogram, viewed between -8 and 10.
# coord_cartesian() only zooms the view. The previous `+ xlim(-8, 10)` set
# scale limits, which discards anything falling outside them and produced a
# "Removed 1 rows containing missing values (geom_bar)" warning.
# Bins are now computed over the data range, so bin edges may shift slightly.
ggplot(data.frame(a = a), aes(x = a)) +
  geom_histogram(bins = 50) +
  coord_cartesian(xlim = c(-8, 10))

# Exponential distribution.
# Two draws with the default rate.
rexp(2)

## [1] 0.7048225 1.8568433

# Two draws with rate 3.
rexp(2, rate = 3)

## [1] 0.1678928 0.3716671

# Histograms of 10,000 draws for three different rates. Uses `<-` for
# assignment and explicit ggplot() calls in place of qplot(), which is
# deprecated in current ggplot2 releases.

# Default rate, 50 bins.
a <- rexp(10000)

ggplot(data.frame(a = a), aes(x = a)) +
  geom_histogram(bins = 50)

# rate = 10, 100 bins.
a <- rexp(10000, rate = 10)

ggplot(data.frame(a = a), aes(x = a)) +
  geom_histogram(bins = 100)

# rate = 0.5, 100 bins.
a <- rexp(10000, rate = 0.5)

ggplot(data.frame(a = a), aes(x = a)) +
  geom_histogram(bins = 100)

# Back to the yogurt data: keep only the rows of the sampled households and
# build the shared price-versus-time base plot once.
yo_sample <- subset(yo, id %in% sample.ids)
price_over_time <- ggplot(yo_sample, aes(x = time, y = price))

# 1. Hollow circles sized by the number of items in the purchase.
price_over_time +
  geom_point(aes(size = all.purchases), shape = 1)

# 2. Same points joined by a line, one panel per household.
price_over_time +
  geom_point(aes(size = all.purchases), shape = 1) +
  geom_line() +
  facet_wrap(~id)

# 3. As above, with the number of items also mapped to colour.
price_over_time +
  geom_point(aes(size = all.purchases, color = all.purchases), shape = 1) +
  geom_line() +
  facet_wrap(~id)

# 4. Default (filled) points and a light-blue to blue colour gradient.
price_over_time +
  geom_point(aes(size = all.purchases, color = all.purchases)) +
  geom_line() +
  facet_wrap(~id) +
  scale_color_gradient(low = "lightblue", high = "blue")

In final: mai multe variabile :)

# Install GGally only when it is missing, so re-running the script does not
# reinstall it every time, and download it over HTTPS rather than plain HTTP.
if (!requireNamespace("GGally", quietly = TRUE)) {
  install.packages("GGally", repos = "https://cran.us.r-project.org")
}

## Installing package into 'C:/Users/ro/Documents/R/win-library/3.3'
## (as 'lib' is unspecified)

## package 'GGally' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\ro\AppData\Local\Temp\RtmpkdCwgC\downloaded_packages

library(GGally)

## Warning: package 'GGally' was built under R version 3.3.3

# Fix the seed so the random row samples drawn for the plots below are
# reproducible.
set.seed(1834)

# Keep the purchase time and four of the flavour counts, selected by name
# (these are columns 3 to 7 of `yo`).
yo_subset <- yo[, c("time", "strawberry", "blueberry", "pina.colada", "plain")]

names(yo_subset)

## [1] "time"        "strawberry"  "blueberry"   "pina.colada" "plain"

str(yo_subset)

## 'data.frame':    2380 obs. of  5 variables:
##  $ time       : int  9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
##  $ strawberry : int  0 0 0 0 1 1 0 0 0 0 ...
##  $ blueberry  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pina.colada: int  0 0 0 0 1 2 0 0 0 0 ...
##  $ plain      : int  0 0 0 0 0 0 0 0 0 0 ...

# Pairwise plot matrix for a random sample of 50 rows.
rows_default <- sample.int(nrow(yo_subset), 50)
ggpairs(yo_subset[rows_default, ])

# Same kind of matrix on a fresh random sample of 50 rows, drawn with a
# smaller base font size.
rows_small_font <- sample.int(nrow(yo_subset), 50)
ggpairs(yo_subset[rows_small_font, ]) +
  theme_grey(base_size = 5)

# Install ggthemes only when it is missing, so re-running the script does not
# reinstall it every time, and download it over HTTPS rather than plain HTTP.
if (!requireNamespace("ggthemes", quietly = TRUE)) {
  install.packages("ggthemes", repos = "https://cran.us.r-project.org")
}

## Installing package into 'C:/Users/ro/Documents/R/win-library/3.3'
## (as 'lib' is unspecified)

## package 'ggthemes' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\ro\AppData\Local\Temp\RtmpkdCwgC\downloaded_packages

library(ggthemes) # Load

## Warning: package 'ggthemes' was built under R version 3.3.3

# Pairwise plot matrix on a random sample of 50 rows, using the Economist
# theme from ggthemes.
rows_economist <- sample.int(nrow(yo_subset), 50)
ggpairs(yo_subset[rows_economist, ]) +
  theme_economist(base_size = 5)

# Another random sample of 50 rows, using the Wall Street Journal theme.
rows_wsj <- sample.int(nrow(yo_subset), 50)
ggpairs(yo_subset[rows_wsj, ]) +
  theme_wsj(base_size = 5)

Alt exemplu

Informatii despre bacsis-ul primit la un restaurant:

valoarea in dolari
nota de plata
sex-ul persoanei care a platit
sunt fumatori in grup
ziua din saptamana
ora
marimea grupului

# Load the restaurant tipping data set shipped with the reshape package.
data("tips", package = "reshape")
# Show the column types and the first values of each column.
str(tips)

## 'data.frame':    244 obs. of  7 variables:
##  $ total_bill: num  17 10.3 21 23.7 24.6 ...
##  $ tip       : num  1.01 1.66 3.5 3.31 3.61 4.71 2 3.12 1.96 3.23 ...
##  $ sex       : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 2 2 2 2 2 ...
##  $ smoker    : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ day       : Factor w/ 4 levels "Fri","Sat","Sun",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ time      : Factor w/ 2 levels "Dinner","Lunch": 1 1 1 1 1 1 1 1 1 1 ...
##  $ size      : int  2 3 3 2 4 4 2 4 2 2 ...

# Plot matrix of the first three columns (total_bill, tip, sex) with the
# default panel types.
ggpairs(data=tips, columns=1:3, title="tips data")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same columns; density plots requested for continuous pairs in the upper
# panels and faceted densities for continuous/discrete pairs in the lower ones.
ggpairs(data=tips, columns=1:3, title="tips data",
        upper=list(continuous='density'),
        lower=list(combo='facetdensity'))

# Same columns; scatter plots in the upper panels, faceted densities in the
# lower panels and bar-style plots on the diagonal.
ggpairs(data=tips, columns=1:3, title="tips data",
        upper=list(continuous='points'),
        lower=list(combo='facetdensity'),
        diag=list(continuous='barDiag')
        )

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# First five columns (adds smoker and day).
# NOTE(review): the GGally documentation appears to list 'box' as a `combo`
# option rather than a `continuous` one - confirm that continuous='box' is
# intended and is still accepted by the installed GGally version.
ggpairs(data=tips, columns=1:5, title="tips data",
        upper=list(continuous='box'),
        lower=list(combo='dot', discrete='ratio'))

# Price over time for the sampled households, coloured from yellow to red by
# the number of items bought, drawn with the Economist theme.
ggplot(subset(yo, id %in% sample.ids), aes(x = time, y = price)) +
  geom_point(aes(colour = all.purchases)) +
  scale_colour_gradient(low = "yellow", high = "red") +
  theme_economist()

Exercitii

Incarcati tabelul diamonds din ggplot2.
Creati un nor de puncte al variabilelor price si x folosind ggplot.
Care e corelatia intre price si x?
Care este corelatia dintre price si y?
Care este corelatia dintre price si z?
Creati un nor de puncte a variabilelor price si depth.
Creati un nor de puncte pentru price si carat. Eliminati 1% din valorile cele mai mari pentru cele doua variabile.
Calculati o variabila noua, volum, ca produs al dimensiunilor x, y si z (x * y * z).
Verificati corelatia dintre pret si volum. Ce se intampla daca eliminati diamantele cu volum 0 sau mai mare de 800?
Setati transparenta in graficul anterior.
Folositi pachetul dplyr pentru a crea un tabel nou cu informatii despre diamante grupate in functie de valorile clarity, in care sa apara pretul mediu, mediana preturilor, pretul minim, pretul maxim, si numarul lor.
Construiti o matrice de grafice care sa prezinte variabilele x, y, z, price, carat si clarity (folosind ggpairs).
Atribuiti o tema (la alegere) tuturor graficelor.