This is a lab to practice with Google Sheets and the
googlesheets
package.
Make sure you go through the authentication process. You’ll see a popup and will need to grant permissions.
gs4_auth()
sheet_url <- "https://docs.google.com/spreadsheets/d/1KIRtcPVn58R3_qr97WNtcOJiY4AaytHzGDzLW_3_R1s/edit?usp=sharing"
read_sheet()
function to read in the data like
we discussed in class, call this object x
.x <- read_sheet(sheet_url)
## ! Using an auto-discovered, cached token.
## To suppress this message, modify your code or options to clearly consent to
## the use of a cached token.
## See gargle's "Non-interactive auth" vignette for more details:
## <https://gargle.r-lib.org/articles/non-interactive-auth.html>
## ℹ The googlesheets4 package is using a cached token for
## 'avamariehoffman@gmail.com'.
## ✔ Reading from "data_wrangling_2019".
## ✔ Range 'Sheet1'.
sheet_names
function to get the sheet names from
the sheet_url
.sheet_names(sheet_url)
## [1] "Sheet1" "Copy of Sheet1"
range = cell_cols()
argument.read_sheet(sheet_url, range = cell_cols("A:B"))
## ✔ Reading from "data_wrangling_2019".
## ✔ Range 'A:B'.
## # A tibble: 35 × 2
## `UW?` `R user (yes or no)`
## <chr> <chr>
## 1 no yes
## 2 no yes
## 3 no yes
## 4 no yes
## 5 no yes
## 6 No YEs but sparse
## 7 no yes
## 8 No Yes
## 9 No Yes
## 10 <NA> <NA>
## # ℹ 25 more rows
x
?
Hint: use a stringr
function.x %>%
filter(str_detect(`Why are you taking this module (free text)`, pattern = "learn"))
## # A tibble: 12 × 7
## `UW?` `R user (yes or no)` `Years of R experience` `Github User`
## <chr> <chr> <list> <chr>
## 1 no yes <dbl [1]> yes
## 2 no yes <dbl [1]> sometimes
## 3 no yes <dbl [1]> no
## 4 no yes <dbl [1]> sparsely
## 5 No YEs but sparse <dbl [1]> no
## 6 no not regularly <dbl [1]> no
## 7 yes meh <chr [1]> meh
## 8 no no <dbl [1]> no
## 9 no no <chr [1]> no
## 10 no future user <chr [1]> no
## 11 no yes <dbl [1]> not really
## 12 yes sort of <chr [1]> sort of
## # ℹ 3 more variables: `Years of Github Experience` <list>,
## # `Why are you taking this module (free text)` <chr>,
## # `Additional comments` <chr>
The following dataset lists airports in the US and details about the number of late flights over time.
jsonData
.jsonData <- fromJSON("https://think.cs.vt.edu/corgis/datasets/json/airlines/airlines.json")
jsonData
?str(jsonData)
## 'data.frame': 4408 obs. of 3 variables:
## $ Airport :'data.frame': 4408 obs. of 2 variables:
## ..$ Code: chr "ATL" "BOS" "BWI" "CLT" ...
## ..$ Name: chr "Atlanta, GA: Hartsfield-Jackson Atlanta International" "Boston, MA: Logan International" "Baltimore, MD: Baltimore/Washington International Thurgood Marshall" "Charlotte, NC: Charlotte Douglas International" ...
## $ Time :'data.frame': 4408 obs. of 4 variables:
## ..$ Label : chr "2003/06" "2003/06" "2003/06" "2003/06" ...
## ..$ Month : int 6 6 6 6 6 6 6 6 6 6 ...
## ..$ Month Name: chr "June" "June" "June" "June" ...
## ..$ Year : int 2003 2003 2003 2003 2003 2003 2003 2003 2003 2003 ...
## $ Statistics:'data.frame': 4408 obs. of 4 variables:
## ..$ # of Delays :'data.frame': 4408 obs. of 5 variables:
## .. ..$ Carrier : int 1009 374 296 300 283 516 986 376 322 247 ...
## .. ..$ Late Aircraft : int 1275 495 477 472 268 323 1390 371 519 256 ...
## .. ..$ National Aviation System: int 3217 685 389 735 487 664 2147 570 1948 427 ...
## .. ..$ Security : int 17 3 8 2 4 11 19 6 10 5 ...
## .. ..$ Weather : int 328 66 78 54 58 98 258 71 121 34 ...
## ..$ Carriers :'data.frame': 4408 obs. of 2 variables:
## .. ..$ Names: chr "American Airlines Inc.,JetBlue Airways,Continental Air Lines Inc.,Delta Air Lines Inc.,Atlantic Southeast Airli"| __truncated__ "American Airlines Inc.,Alaska Airlines Inc.,Continental Air Lines Inc.,Atlantic Coast Airlines,Delta Air Lines "| __truncated__ "American Airlines Inc.,Continental Air Lines Inc.,Delta Air Lines Inc.,AirTran Airways Corporation,America West"| __truncated__ "American Airlines Inc.,Continental Air Lines Inc.,Atlantic Coast Airlines,Delta Air Lines Inc.,Atlantic Southea"| __truncated__ ...
## .. ..$ Total: int 11 14 11 11 13 13 13 11 13 11 ...
## ..$ Flights :'data.frame': 4408 obs. of 5 variables:
## .. ..$ Cancelled: int 216 138 29 73 74 34 394 123 102 13 ...
## .. ..$ Delayed : int 5843 1623 1245 1562 1100 1611 4798 1395 2921 967 ...
## .. ..$ Diverted : int 27 3 15 14 18 22 133 9 42 10 ...
## .. ..$ On Time : int 23974 7875 6998 7021 5321 10024 22303 10227 9441 3672 ...
## .. ..$ Total : int 30060 9639 8287 8670 6513 11691 27628 11754 12506 4662 ...
## ..$ Minutes Delayed:'data.frame': 4408 obs. of 6 variables:
## .. ..$ Carrier : int 61606 20319 13635 14763 13775 26634 70918 21802 20190 12547 ...
## .. ..$ Late Aircraft : int 68335 28189 26810 23379 13712 18969 80714 18715 30905 14600 ...
## .. ..$ National Aviation System: int 118831 24400 17556 23804 20999 23538 90574 16482 91048 14935 ...
## .. ..$ Security : int 518 99 278 127 120 706 683 139 490 141 ...
## .. ..$ Total : int 268764 77167 64480 65865 52747 75428 263521 60667 150513 44303 ...
## .. ..$ Weather : int 19474 4160 6201 3792 4141 5581 20632 3529 7880 2080 ...
# Airport, Time and Statistics
dim(jsonData$Airport)
## [1] 4408 2
dim(jsonData$Time)
## [1] 4408 4
dim(jsonData$Statistics)
## [1] 4408 4
colnames(jsonData$Statistics)
## [1] "# of Delays" "Carriers" "Flights" "Minutes Delayed"
jsonData
to include only Year : 2016. Call this
air_2016
.air_2016 <- jsonData %>%
filter(Time$Year == 2016)
air_2016$Airport %>% count()
## n
## 1 29
air_2016
to include only NYC LaGuardia and
Chicago O’Hare airports (Code : “LGA”, “ORD”). Call this
lga_ord
.lga_ord <- jsonData %>%
filter(Airport$Code %in% c("LGA", "ORD"))
Bonus Practice
Create a new tibble out of Airport Code (Airport$Code
),
Total Flights (Statistics$Flights$Total
), and create a new
column for proportion of delayed flights using
(Statistics$Flights$Delayed
). Call this new tibble
airport_compare
.
airport_list <- list(
airport_code = lga_ord$Airport$Code,
total_flights = lga_ord$Statistics$Flights$Total,
delayed_proportion = lga_ord$Statistics$Flights$Delayed / lga_ord$Statistics$Flights$Total
)
airport_compare <- as_tibble(airport_list)
airport_compare
## # A tibble: 304 × 3
## airport_code total_flights delayed_proportion
## <chr> <int> <dbl>
## 1 LGA 8908 0.180
## 2 ORD 30195 0.162
## 3 LGA 9294 0.208
## 4 ORD 31345 0.232
## 5 LGA 9302 0.265
## 6 ORD 31121 0.200
## 7 LGA 8771 0.192
## 8 ORD 29638 0.165
## 9 LGA 9319 0.148
## 10 ORD 30990 0.147
## # ℹ 294 more rows