Adapted from R for Data Science by Hadley Wichkam and Garrett Grolemund
library(tidyverse)
Maybe the most important step?
FAO1 <- read.csv("FAOSTAT_data.csv", header = TRUE)
library(tidyverse)
FAO1 <- read_csv("FAOSTAT_data.csv")
## Parsed with column specification:
## cols(
## Country = col_character(),
## Element = col_character(),
## Item = col_character(),
## Year = col_integer(),
## Unit = col_character(),
## Value = col_integer()
## )
FAO1
## # A tibble: 40 x 6
## Country Element Item Year Unit Value
## <chr> <chr> <chr> <int> <chr> <int>
## 1 Cambodia Area harvested Bananas 2007 ha 30000
## 2 Cambodia Area harvested Bananas 2008 ha 32000
## 3 Cambodia Area harvested Bananas 2009 ha 33000
## 4 Cambodia Area harvested Bananas 2010 ha 35002
## 5 Cambodia Area harvested Bananas 2011 ha 32574
## 6 Cambodia Area harvested Bananas 2012 ha 31844
## 7 Cambodia Area harvested Bananas 2013 ha 31757
## 8 Cambodia Area harvested Bananas 2014 ha 31741
## 9 Cambodia Area harvested Bananas 2015 ha 31786
## 10 Cambodia Area harvested Bananas 2016 ha 31875
## # ... with 30 more rows
Say that after import, you notice that you would like to change one of your variable types, you can do that by specifying the variable type for each column. For example, let’s say we want to change year to character:
FAO2 <- read_csv("FAOSTAT_data.csv", col_types = cols(
Country = col_character(),
Element = col_character(),
Item = col_character(),
Year = col_character(), ### changed year to character
Unit = col_character(),
Value = col_integer()
))
FAO2
## # A tibble: 40 x 6
## Country Element Item Year Unit Value
## <chr> <chr> <chr> <chr> <chr> <int>
## 1 Cambodia Area harvested Bananas 2007 ha 30000
## 2 Cambodia Area harvested Bananas 2008 ha 32000
## 3 Cambodia Area harvested Bananas 2009 ha 33000
## 4 Cambodia Area harvested Bananas 2010 ha 35002
## 5 Cambodia Area harvested Bananas 2011 ha 32574
## 6 Cambodia Area harvested Bananas 2012 ha 31844
## 7 Cambodia Area harvested Bananas 2013 ha 31757
## 8 Cambodia Area harvested Bananas 2014 ha 31741
## 9 Cambodia Area harvested Bananas 2015 ha 31786
## 10 Cambodia Area harvested Bananas 2016 ha 31875
## # ... with 30 more rows
FAOxls <- readxl::read_excel("FAOSTAT_data2.xlsx")
FAOxls
## # A tibble: 40 x 6
## Country Element Item Year Unit Value
## <chr> <chr> <chr> <dbl> <chr> <dbl>
## 1 Cambodia Area harvested Bananas 2007. ha 30000.
## 2 Cambodia Area harvested Bananas 2008. ha 32000.
## 3 Cambodia Area harvested Bananas 2009. ha 33000.
## 4 Cambodia Area harvested Bananas 2010. ha 35002.
## 5 Cambodia Area harvested Bananas 2011. ha 32574.
## 6 Cambodia Area harvested Bananas 2012. ha 31844.
## 7 Cambodia Area harvested Bananas 2013. ha 31757.
## 8 Cambodia Area harvested Bananas 2014. ha 31741.
## 9 Cambodia Area harvested Bananas 2015. ha 31786.
## 10 Cambodia Area harvested Bananas 2016. ha 31875.
## # ... with 30 more rows
Let’s take a look at three data sets (more info at this Rstudio webinar!)
#devtools::install_github("rstudio/EDAWR")
library(EDAWR)
##
## Attaching package: 'EDAWR'
## The following objects are masked _by_ '.GlobalEnv':
##
## a, b, y
## The following object is masked from 'package:dplyr':
##
## storms
## The following objects are masked from 'package:tidyr':
##
## population, who
stormsA <- EDAWR::storms ## so it doesn't get confused with built-in NOAA weather data!
First, ‘cases’:
cases
## country 2011 2012 2013
## 1 FR 7000 6900 7000
## 2 DE 5800 6000 6200
## 3 US 15000 14000 13000
Notice that each variable stored in its own cell:
Next, ‘polution’:
pollution
## city size amount
## 1 New York large 23
## 2 New York small 14
## 3 London large 22
## 4 London small 16
## 5 Beijing large 121
## 6 Beijing small 56
In ‘pollution’, variables “amount of small particles” and “amount of large particles” are not in their own column. Therefore, doing math with them would be difficult!
Finally, ‘stormsA’:
stormsA
## # tibble [6 × 4]
## storm wind pressure date
## <chr> <int> <int> <date>
## 1 Alberto 110 1007 2000-08-03
## 2 Alex 45 1009 1998-07-27
## 3 Allison 65 1005 1995-06-03
## 4 Ana 40 1013 1997-06-30
## 5 Arlene 50 1010 1999-06-11
## 6 Arthur 45 1010 1996-06-17
This format, where each variable is in it’s own column, is much easier to work with!
stormsB <- (stormsA$pressure / stormsA$wind)
stormsB
## [1] 9.154545 22.422222 15.461538 25.325000 20.200000 22.444444
table1
## # A tibble: 6 x 4
## country year cases population
## <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
table2
## # A tibble: 12 x 4
## country year type count
## <chr> <int> <chr> <int>
## 1 Afghanistan 1999 cases 745
## 2 Afghanistan 1999 population 19987071
## 3 Afghanistan 2000 cases 2666
## 4 Afghanistan 2000 population 20595360
## 5 Brazil 1999 cases 37737
## 6 Brazil 1999 population 172006362
## 7 Brazil 2000 cases 80488
## 8 Brazil 2000 population 174504898
## 9 China 1999 cases 212258
## 10 China 1999 population 1272915272
## 11 China 2000 cases 213766
## 12 China 2000 population 1280428583
table3
## # A tibble: 6 x 3
## country year rate
## * <chr> <int> <chr>
## 1 Afghanistan 1999 745/19987071
## 2 Afghanistan 2000 2666/20595360
## 3 Brazil 1999 37737/172006362
## 4 Brazil 2000 80488/174504898
## 5 China 1999 212258/1272915272
## 6 China 2000 213766/1280428583
Notice that each column is a variable.
table1
## # A tibble: 6 x 4
## country year cases population
## <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
library(ggplot2)
ggplot(table1, aes(year, cases)) +
geom_line(aes(group = country), color = "gray30") +
geom_point(aes(color = country)) +
theme_minimal()
FAO3 <- FAO2 %>% filter(Item == "Cassava")
library(ggplot2)
ggplot(FAO3, aes(Year, Value)) +
geom_point(aes(color = Country)) +
ylab("Area planted (ha)") +
theme_minimal()
Let’s look at the “cases” dataset from earlier
Let’s ‘gather’ the ‘cases’ dataset!
cases
## country 2011 2012 2013
## 1 FR 7000 6900 7000
## 2 DE 5800 6000 6200
## 3 US 15000 14000 13000
cases2 <- gather(cases, key = "year", value = "n", 2:4)
cases2
## country year n
## 1 FR 2011 7000
## 2 DE 2011 5800
## 3 US 2011 15000
## 4 FR 2012 6900
## 5 DE 2012 6000
## 6 US 2012 14000
## 7 FR 2013 7000
## 8 DE 2013 6200
## 9 US 2013 13000
cases3 <- cases %>% gather(key = "year", value = "n", "2011", "2012", "2013")
Let’s ‘spread’ the pollution dataset
pollution
## city size amount
## 1 New York large 23
## 2 New York small 14
## 3 London large 22
## 4 London small 16
## 5 Beijing large 121
## 6 Beijing small 56
pollution2 <- spread(pollution, size, amount)
pollution2
## city large small
## 1 Beijing 121 56
## 2 London 22 16
## 3 New York 23 14
stormsA
## # tibble [6 × 4]
## storm wind pressure date
## <chr> <int> <int> <date>
## 1 Alberto 110 1007 2000-08-03
## 2 Alex 45 1009 1998-07-27
## 3 Allison 65 1005 1995-06-03
## 4 Ana 40 1013 1997-06-30
## 5 Arlene 50 1010 1999-06-11
## 6 Arthur 45 1010 1996-06-17
stormsC <- separate(stormsA, date, into = c("year", "month", "day"), sep = "-")
stormsC
## # tibble [6 × 6]
## storm wind pressure year month day
## <chr> <int> <int> <chr> <chr> <chr>
## 1 Alberto 110 1007 2000 08 03
## 2 Alex 45 1009 1998 07 27
## 3 Allison 65 1005 1995 06 03
## 4 Ana 40 1013 1997 06 30
## 5 Arlene 50 1010 1999 06 11
## 6 Arthur 45 1010 1996 06 17
stormsD <- stormsC %>% unite("date", year, month, day, sep = "-")
stormsD
## # A tibble: 6 x 4
## storm wind pressure date
## <chr> <int> <int> <chr>
## 1 Alberto 110 1007 2000-08-03
## 2 Alex 45 1009 1998-07-27
## 3 Allison 65 1005 1995-06-03
## 4 Ana 40 1013 1997-06-30
## 5 Arlene 50 1010 1999-06-11
## 6 Arthur 45 1010 1996-06-17