Kevin Vo bio photo

Kevin Vo

Data Enthusiast, works in R and Python

Email Facebook LinkedIn Github
> # Load the dplyr package
> library(dplyr)
>
> # Load the hflights package
> library(hflights)
>
> # Call both head() and summary() on hflights
> head(hflights)
     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
5424 2011     1          1         6    1400    1500            AA       428
5425 2011     1          2         7    1401    1501            AA       428
5426 2011     1          3         1    1352    1502            AA       428
5427 2011     1          4         2    1403    1513            AA       428
5428 2011     1          5         3    1405    1507            AA       428
5429 2011     1          6         4    1359    1503            AA       428
     TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
5424  N576AA                60      40      -10        0    IAH  DFW      224
5425  N557AA                60      45       -9        1    IAH  DFW      224
5426  N541AA                70      48       -8       -8    IAH  DFW      224
5427  N403AA                70      39        3        3    IAH  DFW      224
5428  N492AA                62      44       -3        5    IAH  DFW      224
5429  N262AA                64      45       -7       -1    IAH  DFW      224
     TaxiIn TaxiOut Cancelled CancellationCode Diverted
5424      7      13         0                         0
5425      6       9         0                         0
5426      5      17         0                         0
5427      9      22         0                         0
5428      9       9         0                         0
5429      6      13         0                         0
> summary(hflights)
      Year          Month          DayofMonth      DayOfWeek        DepTime
 Min.   :2011   Min.   : 1.000   Min.   : 1.00   Min.   :1.000   Min.   :   1
 1st Qu.:2011   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.:2.000   1st Qu.:1021
 Median :2011   Median : 7.000   Median :16.00   Median :4.000   Median :1416
 Mean   :2011   Mean   : 6.514   Mean   :15.74   Mean   :3.948   Mean   :1396
 3rd Qu.:2011   3rd Qu.: 9.000   3rd Qu.:23.00   3rd Qu.:6.000   3rd Qu.:1801
 Max.   :2011   Max.   :12.000   Max.   :31.00   Max.   :7.000   Max.   :2400
                                                                 NA's   :2905
    ArrTime     UniqueCarrier        FlightNum      TailNum
 Min.   :   1   Length:227496      Min.   :   1   Length:227496
 1st Qu.:1215   Class :character   1st Qu.: 855   Class :character
 Median :1617   Mode  :character   Median :1696   Mode  :character
 Mean   :1578                      Mean   :1962
 3rd Qu.:1953                      3rd Qu.:2755
 Max.   :2400                      Max.   :7290
 NA's   :3066
 ActualElapsedTime    AirTime         ArrDelay          DepDelay
 Min.   : 34.0     Min.   : 11.0   Min.   :-70.000   Min.   :-33.000
 1st Qu.: 77.0     1st Qu.: 58.0   1st Qu.: -8.000   1st Qu.: -3.000
 Median :128.0     Median :107.0   Median :  0.000   Median :  0.000
 Mean   :129.3     Mean   :108.1   Mean   :  7.094   Mean   :  9.445
 3rd Qu.:165.0     3rd Qu.:141.0   3rd Qu.: 11.000   3rd Qu.:  9.000
 Max.   :575.0     Max.   :549.0   Max.   :978.000   Max.   :981.000
 NA's   :3622      NA's   :3622    NA's   :3622      NA's   :2905
    Origin              Dest              Distance          TaxiIn
 Length:227496      Length:227496      Min.   :  79.0   Min.   :  1.000
 Class :character   Class :character   1st Qu.: 376.0   1st Qu.:  4.000
 Mode  :character   Mode  :character   Median : 809.0   Median :  5.000
                                       Mean   : 787.8   Mean   :  6.099
                                       3rd Qu.:1042.0   3rd Qu.:  7.000
                                       Max.   :3904.0   Max.   :165.000
                                                        NA's   :3066
    TaxiOut         Cancelled       CancellationCode      Diverted
 Min.   :  1.00   Min.   :0.00000   Length:227496      Min.   :0.000000
 1st Qu.: 10.00   1st Qu.:0.00000   Class :character   1st Qu.:0.000000
 Median : 14.00   Median :0.00000   Mode  :character   Median :0.000000
 Mean   : 15.09   Mean   :0.01307                      Mean   :0.002853
 3rd Qu.: 18.00   3rd Qu.:0.00000                      3rd Qu.:0.000000
 Max.   :163.00   Max.   :1.00000                      Max.   :1.000000
 NA's   :2947
> # Both the dplyr and hflights packages are loaded
>
> # Convert the hflights data.frame into a hflights tbl
> hflights = tbl_df(hflights)
>
> # Display the hflights tbl
> hflights
Source: local data frame [22,751 x 21]

   Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
1  2011    11         20         7    1840    2127            FL       296
2  2011    10          2         7    1733    1825            XE      4594
3  2011     8         20         6    1540    1630            CO        35
4  2011     9         22         4    1536    1827            XE      2122
5  2011     8          7         7    2130    2228            OO      5839
6  2011     4         30         6     744     837            WN      1493
7  2011     4          5         2    1621    1812            XE      2346
8  2011    12         29         4     832     953            WN       683
9  2011     1         28         5    1218    1337            CO      1629
10 2011     7         12         2    1603    1657            WN        40
..  ...   ...        ...       ...     ...     ...           ...       ...
Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime (int),
  ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr), Distance (int),
  TaxiIn (int), TaxiOut (int), Cancelled (int), CancellationCode (chr),
  Diverted (int)
>
> # Create the object carriers, containing only the UniqueCarrier variable of hflights
> carriers = hflights$UniqueCarrier
~~~


~~~r
> # Both the dplyr and hflights packages are loaded into workspace
> lut <- c("AA" = "American", "AS" = "Alaska", "B6" = "JetBlue", "CO" = "Continental",
         "DL" = "Delta", "OO" = "SkyWest", "UA" = "United", "US" = "US_Airways",
         "WN" = "Southwest", "EV" = "Atlantic_Southeast", "F9" = "Frontier",
         "FL" = "AirTran", "MQ" = "American_Eagle", "XE" = "ExpressJet", "YV" = "Mesa")
>
> # Use lut to translate the UniqueCarrier column of hflights
> hflights$UniqueCarrier = lut[hflights$UniqueCarrier]
>
> # Inspect the resulting raw values of your variables
> glimpse(hflights)
Observations: 22751
Variables:
$ Year              (int) 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2...
$ Month             (int) 11, 10, 8, 9, 8, 4, 4, 12, 1, 7, 3, 11, 10, 12, 1...
$ DayofMonth        (int) 20, 2, 20, 22, 7, 30, 5, 29, 28, 12, 8, 27, 28, 2...
$ DayOfWeek         (int) 7, 7, 6, 4, 7, 6, 2, 4, 5, 2, 2, 7, 5, 3, 1, 4, 7...
$ DepTime           (int) 1840, 1733, 1540, 1536, 2130, 744, 1621, 832, 121...
$ ArrTime           (int) 2127, 1825, 1630, 1827, 2228, 837, 1812, 953, 133...
$ UniqueCarrier     (chr) "AirTran", "ExpressJet", "Continental", "ExpressJ...
$ FlightNum         (int) 296, 4594, 35, 2122, 5839, 1493, 2346, 683, 1629,...
$ TailNum           (chr) "N329AT", "N15941", "N16646", "N14125", "N952SW",...
$ ActualElapsedTime (int) 107, 52, 50, 171, 58, 53, 111, 201, 199, 54, 178,...
$ AirTime           (int) 88, 38, 27, 140, 40, 43, 89, 186, 177, 41, 156, 4...
$ ArrDelay          (int) -8, -8, 4, 3, -15, -8, 25, -17, -30, -3, 9, 13, 9...
$ DepDelay          (int) -4, -3, 0, 1, 0, -1, 16, 7, -2, 3, 8, 11, -6, 2, ...
$ Origin            (chr) "HOU", "IAH", "IAH", "IAH", "IAH", "HOU", "IAH", ...
$ Dest              (chr) "ATL", "CRP", "AUS", "MSP", "DFW", "MSY", "HSV", ...
$ Distance          (int) 696, 201, 140, 1034, 224, 303, 595, 1390, 1347, 2...
$ TaxiIn            (int) 11, 3, 6, 12, 4, 3, 4, 7, 7, 4, 7, 4, 5, 9, 3, NA...
$ TaxiOut           (int) 8, 11, 17, 19, 14, 7, 18, 8, 15, 9, 15, 11, 25, 1...
$ Cancelled         (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0...
$ CancellationCode  (chr) "", "", "", "", "", "", "", "", "", "", "", "", "...
$ Diverted          (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
>
> # The hflights tbl you built in the previous exercise is available in the workspace.
> 
> # Build the lookup table: lut
> lut= c("A"= "carrier",
        "B" = "weather",
        "C" = "FFA",
        "D" = "security",
        "E" = "not cancelled")
> # Use the lookup table to create a vector of code labels. Assign the vector to the CancellationCode column of hflights
> hflights$CancellationCode = lut[hflights$CancellationCode]
> 
> 
> # Inspect the resulting raw values of your variables
> glimpse(hflights)
Observations: 22751
Variables:
$ Year              (int) 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2...
$ Month             (int) 11, 10, 8, 9, 8, 4, 4, 12, 1, 7, 3, 11, 10, 12, 1...
$ DayofMonth        (int) 20, 2, 20, 22, 7, 30, 5, 29, 28, 12, 8, 27, 28, 2...
$ DayOfWeek         (int) 7, 7, 6, 4, 7, 6, 2, 4, 5, 2, 2, 7, 5, 3, 1, 4, 7...
$ DepTime           (int) 1840, 1733, 1540, 1536, 2130, 744, 1621, 832, 121...
$ ArrTime           (int) 2127, 1825, 1630, 1827, 2228, 837, 1812, 953, 133...
$ UniqueCarrier     (chr) "AirTran", "ExpressJet", "Continental", "ExpressJ...
$ FlightNum         (int) 296, 4594, 35, 2122, 5839, 1493, 2346, 683, 1629,...
$ TailNum           (chr) "N329AT", "N15941", "N16646", "N14125", "N952SW",...
$ ActualElapsedTime (int) 107, 52, 50, 171, 58, 53, 111, 201, 199, 54, 178,...
$ AirTime           (int) 88, 38, 27, 140, 40, 43, 89, 186, 177, 41, 156, 4...
$ ArrDelay          (int) -8, -8, 4, 3, -15, -8, 25, -17, -30, -3, 9, 13, 9...
$ DepDelay          (int) -4, -3, 0, 1, 0, -1, 16, 7, -2, 3, 8, 11, -6, 2, ...
$ Origin            (chr) "HOU", "IAH", "IAH", "IAH", "IAH", "HOU", "IAH", ...
$ Dest              (chr) "ATL", "CRP", "AUS", "MSP", "DFW", "MSY", "HSV", ...
$ Distance          (int) 696, 201, 140, 1034, 224, 303, 595, 1390, 1347, 2...
$ TaxiIn            (int) 11, 3, 6, 12, 4, 3, 4, 7, 7, 4, 7, 4, 5, 9, 3, NA...
$ TaxiOut           (int) 8, 11, 17, 19, 14, 7, 18, 8, 15, 9, 15, 11, 25, 1...
$ Cancelled         (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0...
$ CancellationCode  (chr) "not cancelled", "not cancelled", "not cancelled"...
$ Diverted          (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
> 
Figure. Dplyr 5 Verbs
> # hflights is pre-loaded as a tbl, together with the necessary libraries.
> 
> # Print out a tbl with the four columns of hflights related to delay
> select(hflights, ActualElapsedTime, AirTime, ArrDelay, DepDelay)
Source: local data frame [22,751 x 4]

   ActualElapsedTime AirTime ArrDelay DepDelay
1                107      88       -8       -4
2                 52      38       -8       -3
3                 50      27        4        0
4                171     140        3        1
5                 58      40      -15        0
6                 53      43       -8       -1
7                111      89       25       16
8                201     186      -17        7
9                199     177      -30       -2
10                54      41       -3        3
..               ...     ...      ...      ...
> 
> # Print out hflights, nothing has changed!
> hflights
Source: local data frame [22,751 x 21]

   Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
1  2011    11         20         7    1840    2127            FL       296
2  2011    10          2         7    1733    1825            XE      4594
3  2011     8         20         6    1540    1630            CO        35
4  2011     9         22         4    1536    1827            XE      2122
5  2011     8          7         7    2130    2228            OO      5839
6  2011     4         30         6     744     837            WN      1493
7  2011     4          5         2    1621    1812            XE      2346
8  2011    12         29         4     832     953            WN       683
9  2011     1         28         5    1218    1337            CO      1629
10 2011     7         12         2    1603    1657            WN        40
..  ...   ...        ...       ...     ...     ...           ...       ...
Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime (int),
  ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr), Distance (int),
  TaxiIn (int), TaxiOut (int), Cancelled (int), CancellationCode (chr),
  Diverted (int)
> 
> # Print out the columns Origin up to Cancelled of hflights
> select(hflights, Origin:Cancelled)
Source: local data frame [22,751 x 6]

   Origin Dest Distance TaxiIn TaxiOut Cancelled
1     HOU  ATL      696     11       8         0
2     IAH  CRP      201      3      11         0
3     IAH  AUS      140      6      17         0
4     IAH  MSP     1034     12      19         0
5     IAH  DFW      224      4      14         0
6     HOU  MSY      303      3       7         0
7     IAH  HSV      595      4      18         0
8     HOU  LAX     1390      7       8         0
9     IAH  SNA     1347      7      15         0
10    HOU  DAL      239      4       9         0
..    ...  ...      ...    ...     ...       ...
> 
> # Answer to last question: be concise!
> select(hflights, -DepTime, -ArrTime, -UniqueCarrier, -FlightNum, -TailNum, -ActualElapsedTime, -AirTime)
Source: local data frame [22,751 x 14]

   Year Month DayofMonth DayOfWeek ArrDelay DepDelay Origin Dest Distance
1  2011    11         20         7       -8       -4    HOU  ATL      696
2  2011    10          2         7       -8       -3    IAH  CRP      201
3  2011     8         20         6        4        0    IAH  AUS      140
4  2011     9         22         4        3        1    IAH  MSP     1034
5  2011     8          7         7      -15        0    IAH  DFW      224
6  2011     4         30         6       -8       -1    HOU  MSY      303
7  2011     4          5         2       25       16    IAH  HSV      595
8  2011    12         29         4      -17        7    HOU  LAX     1390
9  2011     1         28         5      -30       -2    IAH  SNA     1347
10 2011     7         12         2       -3        3    HOU  DAL      239
..  ...   ...        ...       ...      ...      ...    ...  ...      ...
Variables not shown: TaxiIn (int), TaxiOut (int), Cancelled (int),
  CancellationCode (chr), Diverted (int)
> 
> # As usual, hflights is pre-loaded as a tbl, together with the necessary libraries.
> 
> # Print out a tbl containing just ArrDelay and DepDelay
> select(hflights, contains("Delay"))
Source: local data frame [22,751 x 2]

   ArrDelay DepDelay
1        -8       -4
2        -8       -3
3         4        0
4         3        1
5       -15        0
6        -8       -1
7        25       16
8       -17        7
9       -30       -2
10       -3        3
..      ...      ...
> 
> # Print out a tbl as described in the second instruction, using both helper functions and variable names
> select(hflights, starts_with("Unique"), contains("Num"), contains("Cancel"))
Source: local data frame [22,751 x 5]

   UniqueCarrier FlightNum TailNum Cancelled CancellationCode
1             FL       296  N329AT         0                 
2             XE      4594  N15941         0                 
3             CO        35  N16646         0                 
4             XE      2122  N14125         0                 
5             OO      5839  N952SW         0                 
6             WN      1493  N785SW         0                 
7             XE      2346  N13994         0                 
8             WN       683  N415WN         0                 
9             CO      1629  N15710         0                 
10            WN        40  N614SW         0                 
..           ...       ...     ...       ...              ...
> 
> # Print out a tbl as described in the third instruction, using only helper functions.
> select(hflights,contains("Time"), contains("Delay"))
Source: local data frame [22,751 x 6]

   DepTime ArrTime ActualElapsedTime AirTime ArrDelay DepDelay
1     1840    2127               107      88       -8       -4
2     1733    1825                52      38       -8       -3
3     1540    1630                50      27        4        0
4     1536    1827               171     140        3        1
5     2130    2228                58      40      -15        0
6      744     837                53      43       -8       -1
7     1621    1812               111      89       25       16
8      832     953               201     186      -17        7
9     1218    1337               199     177      -30       -2
10    1603    1657                54      41       -3        3
..     ...     ...               ...     ...      ...      ...
> 
> # both hflights and dplyr are available
> 
> ex1r <- hflights[c("TaxiIn","TaxiOut","Distance")]
> ex1d <- select(hflights, TaxiIn, TaxiOut, Distance)
> ex1r
Source: local data frame [22,751 x 3]

   TaxiIn TaxiOut Distance
1      11       8      696
2       3      11      201
3       6      17      140
4      12      19     1034
5       4      14      224
6       3       7      303
7       4      18      595
8       7       8     1390
9       7      15     1347
10      4       9      239
..    ...     ...      ...
> ex1d
Source: local data frame [22,751 x 3]

   TaxiIn TaxiOut Distance
1      11       8      696
2       3      11      201
3       6      17      140
4      12      19     1034
5       4      14      224
6       3       7      303
7       4      18      595
8       7       8     1390
9       7      15     1347
10      4       9      239
..    ...     ...      ...
> 
> ex2r <- hflights[c("Year","Month","DayOfWeek","DepTime","ArrTime")]
> ex2d <- select(hflights, Year, Month, DayOfWeek, DepTime, ArrTime)
> ex2r
Source: local data frame [22,751 x 5]

   Year Month DayOfWeek DepTime ArrTime
1  2011    11         7    1840    2127
2  2011    10         7    1733    1825
3  2011     8         6    1540    1630
4  2011     9         4    1536    1827
5  2011     8         7    2130    2228
6  2011     4         6     744     837
7  2011     4         2    1621    1812
8  2011    12         4     832     953
9  2011     1         5    1218    1337
10 2011     7         2    1603    1657
..  ...   ...       ...     ...     ...
> ex2d
Source: local data frame [22,751 x 5]

   Year Month DayOfWeek DepTime ArrTime
1  2011    11         7    1840    2127
2  2011    10         7    1733    1825
3  2011     8         6    1540    1630
4  2011     9         4    1536    1827
5  2011     8         7    2130    2228
6  2011     4         6     744     837
7  2011     4         2    1621    1812
8  2011    12         4     832     953
9  2011     1         5    1218    1337
10 2011     7         2    1603    1657
..  ...   ...       ...     ...     ...
> 
> ex3r <- hflights[c("TailNum","TaxiIn","TaxiOut")]
> ex3d <- select(hflights, TailNum, TaxiIn, TaxiOut)
> ex3r
Source: local data frame [22,751 x 3]

   TailNum TaxiIn TaxiOut
1   N329AT     11       8
2   N15941      3      11
3   N16646      6      17
4   N14125     12      19
5   N952SW      4      14
6   N785SW      3       7
7   N13994      4      18
8   N415WN      7       8
9   N15710      7      15
10  N614SW      4       9
..     ...    ...     ...
> ex3d
Source: local data frame [22,751 x 3]

   TailNum TaxiIn TaxiOut
1   N329AT     11       8
2   N15941      3      11
3   N16646      6      17
4   N14125     12      19
5   N952SW      4      14
6   N785SW      3       7
7   N13994      4      18
8   N415WN      7       8
9   N15710      7      15
10  N614SW      4       9
..     ...    ...     ...
> 
> # hflights and dplyr are loaded and ready to serve you.
> 
> # Add the new variable ActualGroundTime to a copy of hflights and save the result as g1.
> g1 = mutate(hflights, ActualGroundTime = ActualElapsedTime - AirTime)
> g1
Source: local data frame [22,751 x 22]

   Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
1  2011    11         20         7    1840    2127            FL       296
2  2011    10          2         7    1733    1825            XE      4594
3  2011     8         20         6    1540    1630            CO        35
4  2011     9         22         4    1536    1827            XE      2122
5  2011     8          7         7    2130    2228            OO      5839
6  2011     4         30         6     744     837            WN      1493
7  2011     4          5         2    1621    1812            XE      2346
8  2011    12         29         4     832     953            WN       683
9  2011     1         28         5    1218    1337            CO      1629
10 2011     7         12         2    1603    1657            WN        40
..  ...   ...        ...       ...     ...     ...           ...       ...
Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime (int),
  ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr), Distance (int),
  TaxiIn (int), TaxiOut (int), Cancelled (int), CancellationCode (chr),
  Diverted (int), ActualGroundTime (int)
  
> select(g1, ActualElapsedTime, AirTime, ActualGroundTime)
Source: local data frame [22,751 x 3]

   ActualElapsedTime AirTime ActualGroundTime
1                107      88               19
2                 52      38               14
3                 50      27               23
4                171     140               31
5                 58      40               18
6                 53      43               10
7                111      89               22
8                201     186               15
9                199     177               22
10                54      41               13
..               ...     ...              ...
> 
> # Add the new variable GroundTime to a g1. Save the result as g2.
> g2 = mutate(g1, GroundTime = TaxiIn + TaxiOut)
> select(g2, TaxiIn, TaxiOut, GroundTime)
Source: local data frame [22,751 x 3]

   TaxiIn TaxiOut GroundTime
1      11       8         19
2       3      11         14
3       6      17         23
4      12      19         31
5       4      14         18
6       3       7         10
7       4      18         22
8       7       8         15
9       7      15         22
10      4       9         13
..    ...     ...        ...
> 
> # Add the new variable AverageSpeed to g2. Save the result as g3.
> g3 = mutate(g2, AverageSpeed = Distance / AirTime * 60)
> select(g3, Distance, AirTime, AverageSpeed)
Source: local data frame [22,751 x 3]

   Distance AirTime AverageSpeed
1       696      88     474.5455
2       201      38     317.3684
3       140      27     311.1111
4      1034     140     443.1429
5       224      40     336.0000
6       303      43     422.7907
7       595      89     401.1236
8      1390     186     448.3871
9      1347     177     456.6102
10      239      41     349.7561
..      ...     ...          ...
> 
> # Print out g3
> g3
Source: local data frame [22,751 x 24]

   Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
1  2011    11         20         7    1840    2127            FL       296
2  2011    10          2         7    1733    1825            XE      4594
3  2011     8         20         6    1540    1630            CO        35
4  2011     9         22         4    1536    1827            XE      2122
5  2011     8          7         7    2130    2228            OO      5839
6  2011     4         30         6     744     837            WN      1493
7  2011     4          5         2    1621    1812            XE      2346
8  2011    12         29         4     832     953            WN       683
9  2011     1         28         5    1218    1337            CO      1629
10 2011     7         12         2    1603    1657            WN        40
..  ...   ...        ...       ...     ...     ...           ...       ...
Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime (int),
  ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr), Distance (int),
  TaxiIn (int), TaxiOut (int), Cancelled (int), CancellationCode (chr),
  Diverted (int), ActualGroundTime (int), GroundTime (int), AverageSpeed (dbl)
> 
> # hflights and dplyr are ready, are you?
> 
> # Add a second variable loss_percent to the dataset: m1
> m1 <- mutate(hflights, loss = ArrDelay - DepDelay, loss_percent = (ArrDelay- DepDelay)/DepDelay * 100)
> select(m1, ArrDelay, DepDelay, loss, loss_percent)
Source: local data frame [22,751 x 4]

   ArrDelay DepDelay loss loss_percent
1        -8       -4   -4     100.0000
2        -8       -3   -5     166.6667
3         4        0    4          Inf
4         3        1    2     200.0000
5       -15        0  -15         -Inf
6        -8       -1   -7     700.0000
7        25       16    9      56.2500
8       -17        7  -24    -342.8571
9       -30       -2  -28    1400.0000
10       -3        3   -6    -200.0000
..      ...      ...  ...          ...
> 
> # Copy and adapt the previous command to reduce redendancy: m2
> m2 = mutate(m1, loss_percent = 100*loss/DepDelay )
> select(m2, ArrDelay, DepDelay, loss_percent)
Source: local data frame [22,751 x 3]

   ArrDelay DepDelay loss_percent
1        -8       -4     100.0000
2        -8       -3     166.6667
3         4        0          Inf
4         3        1     200.0000
5       -15        0         -Inf
6        -8       -1     700.0000
7        25       16      56.2500
8       -17        7    -342.8571
9       -30       -2    1400.0000
10       -3        3    -200.0000
..      ...      ...          ...
> 
> # Add the three variables as described in the third instruction: m3
> m3 = mutate(hflights, TotalTaxi = TaxiIn + TaxiOut, ActualGroundTime = ActualElapsedTime -AirTime, Diff = TotalTaxi - ActualGroundTime)
> select(m3, TaxiIn, TaxiOut, TotalTaxi, ActualElapsedTime, AirTime, ActualGroundTime, Diff)
Source: local data frame [22,751 x 7]

   TaxiIn TaxiOut TotalTaxi ActualElapsedTime AirTime ActualGroundTime Diff
1      11       8        19               107      88               19    0
2       3      11        14                52      38               14    0
3       6      17        23                50      27               23    0
4      12      19        31               171     140               31    0
5       4      14        18                58      40               18    0
6       3       7        10                53      43               10    0
7       4      18        22               111      89               22    0
8       7       8        15               201     186               15    0
9       7      15        22               199     177               22    0
10      4       9        13                54      41               13    0
..    ...     ...       ...               ...     ...              ...  ...
Figure. Filter
> # All flights that traveled 3000 miles or more
> filter(hflights, Distance > 3000)
Source: local data frame [62 x 21]

   Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
1  2011    11         27         7     944    1357   Continental         1
2  2011     3          7         1     929    1348   Continental         1
3  2011     1         25         2     924    1337   Continental         1
4  2011    12         24         6     937    1410   Continental         1
5  2011     4         26         2     934    1249   Continental         1
6  2011    11         30         3     940    1407   Continental         1
7  2011     9         26         1     951    1250   Continental         1
8  2011     3         31         4    1013    1304   Continental         1
9  2011     4          8         5     940    1307   Continental         1
10 2011     5         12         4    1511    1902   Continental       407
..  ...   ...        ...       ...     ...     ...           ...       ...
Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime (int),
  ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr), Distance (int),
  TaxiIn (int), TaxiOut (int), Cancelled (int), CancellationCode (chr),
  Diverted (int)
> 
> # All flights flown by one of JetBlue, Southwest, or Delta
> filter(hflights, UniqueCarrier == "JetBlue" | UniqueCarrier == "Southwest" | UniqueCarrier == "Delta")
Source: local data frame [4,885 x 21]

   Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
1  2011     4         30         6     744     837     Southwest      1493
2  2011    12         29         4     832     953     Southwest       683
3  2011     7         12         2    1603    1657     Southwest        40
4  2011    11         27         7    1621    1723     Southwest        33
5  2011     6          5         7     954    1309     Southwest         7
6  2011     6         24         5    1415    1503     Southwest        32
7  2011     2         26         6    1538    1951       JetBlue       624
8  2011    11         27         7    1603    1908         Delta      1846
9  2011    10          9         7    1232    1423     Southwest      2492
10 2011    12         30         5    1944    2045     Southwest      3499
..  ...   ...        ...       ...     ...     ...           ...       ...
Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime (int),
  ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr), Distance (int),
  TaxiIn (int), TaxiOut (int), Cancelled (int), CancellationCode (chr),
  Diverted (int)
> 
> # All flights where taxiing took longer than flying
> filter(hflights, TaxiIn + TaxiOut > AirTime)
Source: local data frame [145 x 21]

   Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
1  2011     7          8         5    1248    1348   Continental      1467
2  2011     8         14         7    1751    1913   Continental      1779
3  2011    10         17         1    1135    1246    ExpressJet      4268
4  2011    11         15         2    1334    1841   Continental      1561
5  2011     3         24         4    2112    2221   Continental      1533
6  2011     3         18         5    1812    1914   Continental        35
7  2011     6         22         3    1317    1501      American      1534
8  2011    11         15         2    1616    1751    ExpressJet      4559
9  2011    11         29         2    1800    1915   Continental      1145
10 2011     2          4         5    2114    2228   Continental      1533
..  ...   ...        ...       ...     ...     ...           ...       ...
Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime (int),
  ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr), Distance (int),
  TaxiIn (int), TaxiOut (int), Cancelled (int), CancellationCode (chr),
  Diverted (int)
>  
> # All flights that departed before 5am or arrived after 10pm
> filter(hflights, DepTime < 500  | ArrTime > 2200)
Source: local data frame [2,774 x 21]

   Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
1  2011     8          7         7    2130    2228            OO      5839
2  2011     1         31         1    2110    2203            XE      2024
3  2011     3         11         5    2114    2343            CO       467
4  2011    10         20         4    1801    2229            CO      1524
5  2011     7          7         4    1914    2222            XE      3050
6  2011     1          3         1    2125    2243            CO      1717
7  2011     4         24         7    1951    2319            XE      2094
8  2011    10          5         3    2049    2202            CO      1530
9  2011     7         22         5    2023    2336            XE      2288
10 2011     2         18         5    1928    2201            OO      5816
..  ...   ...        ...       ...     ...     ...           ...       ...
Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime (int),
  ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr), Distance (int),
  TaxiIn (int), TaxiOut (int), Cancelled (int), CancellationCode (chr),
  Diverted (int)
> 
> # All flights that departed late but arrived ahead of schedule
> filter(hflights, DepDelay > 0 & ArrDelay < 0)
Source: local data frame [2,778 x 21]

   Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
1  2011    12         29         4     832     953            WN       683
2  2011     7         12         2    1603    1657            WN        40
3  2011    12         28         3    1332    1636            CO      1151
4  2011     2         26         6    1538    1951            B6       624
5  2011    12         21         3     914    1038            CO      1257
6  2011    11         11         5     731    1047            CO      1160
7  2011    12         30         5    1944    2045            WN      3499
8  2011     8         24         3     948    1101            OO      2001
9  2011     1         16         7    1541    1701            CO       795
10 2011     3          7         1    1735    2003            CO       667
..  ...   ...        ...       ...     ...     ...           ...       ...
Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime (int),
  ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr), Distance (int),
  TaxiIn (int), TaxiOut (int), Cancelled (int), CancellationCode (chr),
  Diverted (int)
> 
> # All cancelled weekend flights
> filter(hflights, DayOfWeek %in% c(6,7) & Cancelled == 1)
Source: local data frame [55 x 21]

   Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
1  2011     4         17         7      NA      NA            WN        42
2  2011     2          6         7      NA      NA            XE      2570
3  2011     4          2         6      NA      NA            WN      3823
4  2011     2          6         7      NA      NA            XE      2902
5  2011    10          9         7      NA      NA            XE      4289
6  2011    12         11         7    1303      NA            UA       399
7  2011     8         27         6      NA      NA            CO      1451
8  2011     5          1         7      NA      NA            OO      5814
9  2011    12         31         6      NA      NA            XE      4542
10 2011     3          6         7      NA      NA            XE      2928
..  ...   ...        ...       ...     ...     ...           ...       ...
Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime (int),
  ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr), Distance (int),
  TaxiIn (int), TaxiOut (int), Cancelled (int), CancellationCode (chr),
  Diverted (int)
> 
> # All flights that were cancelled after being delayed
> filter(hflights, DepDelay > 0 & Cancelled == 1)
Source: local data frame [2 x 21]

  Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
1 2011     9         16         5    1234      NA            OO      2059
2 2011     1         26         3    1926      NA            CO       310
Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime (int),
  ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr), Distance (int),
  TaxiIn (int), TaxiOut (int), Cancelled (int), CancellationCode (chr),
  Diverted (int)
> 
> # Select the flights that had JFK as their destination: c1
> c1 = filter(hflights, Dest == "JFK")
> c1
Source: local data frame [72 x 21]

   Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
1  2011     2         26         6    1538    1951            B6       624
2  2011     8         17         3     623    1116            B6       620
3  2011     9         23         5     645    1046            B6       620
4  2011     2         23         3    1546    2002            B6       624
5  2011     7         25         1    1759    2307            B6       622
6  2011     8         11         4     627    1040            B6       620
7  2011     8          5         5     627    1058            B6       620
8  2011     6         20         1     656    1108            B6       620
9  2011     1         12         3    1532    1953            B6       624
10 2011     4          8         5    1526    1937            B6       624
..  ...   ...        ...       ...     ...     ...           ...       ...
Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime (int),
  ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr), Distance (int),
  TaxiIn (int), TaxiOut (int), Cancelled (int), CancellationCode (chr),
  Diverted (int)
> 
> # Combine the Year, Month and DayofMonth variables to create a Date column: c2
> c2 = mutate(c1, Date = paste(Year, Month, DayofMonth, sep = "-"))
> c2
Source: local data frame [72 x 22]

   Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
1  2011     2         26         6    1538    1951            B6       624
2  2011     8         17         3     623    1116            B6       620
3  2011     9         23         5     645    1046            B6       620
4  2011     2         23         3    1546    2002            B6       624
5  2011     7         25         1    1759    2307            B6       622
6  2011     8         11         4     627    1040            B6       620
7  2011     8          5         5     627    1058            B6       620
8  2011     6         20         1     656    1108            B6       620
9  2011     1         12         3    1532    1953            B6       624
10 2011     4          8         5    1526    1937            B6       624
..  ...   ...        ...       ...     ...     ...           ...       ...
Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime (int),
  ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr), Distance (int),
  TaxiIn (int), TaxiOut (int), Cancelled (int), CancellationCode (chr),
  Diverted (int), Date (chr)
> 
> # Print out a selection of columns of c2
> select(c2, Date, DepTime, ArrTime, TailNum)
Source: local data frame [72 x 4]

        Date DepTime ArrTime TailNum
1  2011-2-26    1538    1951  N292JB
2  2011-8-17     623    1116  N516JB
3  2011-9-23     645    1046  N627JB
4  2011-2-23    1546    2002  N318JB
5  2011-7-25    1759    2307  N657JB
6  2011-8-11     627    1040  N639JB
7   2011-8-5     627    1058  N635JB
8  2011-6-20     656    1108  N247JB
9  2011-1-12    1532    1953  N504JB
10  2011-4-8    1526    1937  N316JB
..       ...     ...     ...     ...
# How many weekend flights flew a distance of more than 1000 miles but had a total taxiing time below 15 minutes?
> select(filter(hflights, DayOfWeek %in% c(6,7) & Distance > 1000 & (TaxiIn + TaxiOut)< 15), Distance, TaxiIn,TaxiOut)
Source: local data frame [155 x 3]

   Distance TaxiIn TaxiOut
1      1235      5       7
2      1020      2       6
3      1303      2      12
4      1312      3       6
5      1246      4       7
6      1195      6       7
7      1235      4       7
8      1530      6       8
9      1235      3      10
10     1043      6       8
..      ...    ...     ...
> # Definition of dtc
> dtc <- filter(hflights, Cancelled == 1, !is.na(DepDelay))
> 
> # Arrange dtc by departure delays so that the shortest departure delay is at the top of the data set
> arrange(dtc, DepDelay)
Source: local data frame [7 x 21]

  Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
1 2011     9         29         4    1639      NA            OO      2062
2 2011     4         30         6     612      NA            EV      5386
3 2011    12         25         7    1652      NA            XE      4375
4 2011    12         11         7    1303      NA            UA       399
5 2011     1         26         3    1703      NA            CO       410
6 2011     1         26         3    1926      NA            CO       310
7 2011     9         16         5    1234      NA            OO      2059
Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime (int),
  ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr), Distance (int),
  TaxiIn (int), TaxiOut (int), Cancelled (int), CancellationCode (chr),
  Diverted (int)
> 
> # Arrange dtc so that cancellation reasons are grouped
> arrange(dtc, CancellationCode)
Source: local data frame [7 x 21]

  Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
1 2011    12         11         7    1303      NA            UA       399
2 2011     4         30         6     612      NA            EV      5386
3 2011    12         25         7    1652      NA            XE      4375
4 2011     9         16         5    1234      NA            OO      2059
5 2011     1         26         3    1926      NA            CO       310
6 2011     9         29         4    1639      NA            OO      2062
7 2011     1         26         3    1703      NA            CO       410
Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime (int),
  ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr), Distance (int),
  TaxiIn (int), TaxiOut (int), Cancelled (int), CancellationCode (chr),
  Diverted (int)
> 
> # Arrange dtc according to carrier and departure delays
> arrange(dtc, UniqueCarrier, DepDelay)
Source: local data frame [7 x 21]

  Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
1 2011     1         26         3    1703      NA            CO       410
2 2011     1         26         3    1926      NA            CO       310
3 2011     4         30         6     612      NA            EV      5386
4 2011     9         29         4    1639      NA            OO      2062
5 2011     9         16         5    1234      NA            OO      2059
6 2011    12         11         7    1303      NA            UA       399
7 2011    12         25         7    1652      NA            XE      4375
Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime (int),
  ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr), Distance (int),
  TaxiIn (int), TaxiOut (int), Cancelled (int), CancellationCode (chr),
  Diverted (int)
> 
> # Arrange according to carrier and decreasing departure delays
> arrange(hflights, UniqueCarrier, desc(DepDelay))
Source: local data frame [22,751 x 21]

   Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
1  2011     5         12         4    2158    2301            AA       426
2  2011    11         15         2    1742    1858            AA       458
3  2011    12         17         6    1805    1912            AA      1033
4  2011     6         21         2    1050    1148            AA      1995
5  2011     6         22         3    1317    1501            AA      1534
6  2011    10         13         4    1138    1454            AA      1946
7  2011     3          9         3    2137    2240            AA       653
8  2011     5         20         5    1020    1136            AA      2002
9  2011     3         24         4     821     924            AA      1225
10 2011     5          2         1    2116    2224            AA       426
..  ...   ...        ...       ...     ...     ...           ...       ...
Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime (int),
  ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr), Distance (int),
  TaxiIn (int), TaxiOut (int), Cancelled (int), CancellationCode (chr),
  Diverted (int)
> 
> # Arrange flights by total delay (normal order).
> arrange(hflights, DepDelay + ArrDelay)
Source: local data frame [22,751 x 21]

   Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
1  2011    12         25         7     741     926            OO      4591
2  2011    12          4         7     759    1014            XE      4455
3  2011     9         10         6    1830    2036            AS       731
4  2011     3          8         2    1037    1437            CO       432
5  2011    10          2         7     939    1055            OO      5166
6  2011     2         14         1    1917    2027            MQ      3328
7  2011     1          3         1    1056    1303            MQ      3796
8  2011     1         11         2    1005    1326            CO      1544
9  2011     9         29         4     938    1057            OO      2040
10 2011     9          2         5    1855    2238            CO      1661
..  ...   ...        ...       ...     ...     ...           ...       ...
Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime (int),
  ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr), Distance (int),
  TaxiIn (int), TaxiOut (int), Cancelled (int), CancellationCode (chr),
  Diverted (int)
> 
> # Keep flights leaving to DFW before 8am and arrange according to decreasing AirTime 
> arrange(filter(hflights, Dest == "DFW", DepTime < 800), desc(AirTime) )
Source: local data frame [77 x 21]

   Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
1  2011    12         13         2     706     824            MQ      3328
2  2011    12          9         5     724     854            CO      1206
3  2011    12         14         3     732     849            XE      4326
4  2011     1         21         5     746     904            OO      1117
5  2011    11          2         3     556     724            MQ      3265
6  2011     8          5         5     559     711            MQ      3265
7  2011    12         13         2     729     845            XE      4326
8  2011     7          1         5     557     703            MQ      3265
9  2011     7          5         2     557     701            MQ      3265
10 2011    11         24         4     601     715            MQ      3265
..  ...   ...        ...       ...     ...     ...           ...       ...
Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime (int),
  ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr), Distance (int),
  TaxiIn (int), TaxiOut (int), Cancelled (int), CancellationCode (chr),
  Diverted (int)
> 
# print a list containing only TailNum of flights that departed too late, sorted by total taxiing time
> select(arrange(filter(hflights, DepDelay > 30), TaxiIn + TaxiOut),TailNum)
Source: local data frame [2,152 x 1]

   TailNum
1   N747SA
2   N630WN
3   N912WN
4   N255WN
5   N711HK
6   N353SW
7   N326SW
8   N636WN
9   N751SW
10  N9222N
..     ...
> # Print out a summary with variables min_dist and max_dist
> summarise(hflights, min_dist = min(Distance), max_dist = max(Distance))
Source: local data frame [1 x 2]

  min_dist max_dist
1       79     3904
> 
> # Print out a summary with variable max_div : the longest Distance for diverted flights
> summarise(filter(hflights, Diverted == 1), max_div = max(Distance))
Source: local data frame [1 x 1]

  max_div
1    1428
> temp1 = filter(hflights, !is.na(ArrDelay))
> summarise(temp1, earliest = min(ArrDelay), 
    average = mean(ArrDelay), latest = max(ArrDelay), sd = sd(ArrDelay))
Source: local data frame [1 x 4]

  earliest  average latest       sd
1      -57 6.836407    704 30.08375
> temp2 = filter(hflights, !is.na(TaxiIn) & !is.na(TaxiOut))
> summarise(temp2, max_taxi_diff = max(abs(TaxiIn - TaxiOut)))
Source: local data frame [1 x 1]

  max_taxi_diff
1           141
> # hflights is available
> 
> # Remove rows that have NA ArrDelay: temp1
> temp1 = filter(hflights, !is.na(ArrDelay))
> 
> # Generate summary about ArrDelay column of temp1
> summarise(temp1, earliest = min(ArrDelay), 
    average = mean(ArrDelay), latest = max(ArrDelay), sd = sd(ArrDelay))
Source: local data frame [1 x 4]

  earliest  average latest       sd
1      -57 6.836407    704 30.08375
> 
> # Keep rows that have no NA TaxiIn and no NA TaxiOut: temp2
> temp2 = filter(hflights, !is.na(TaxiIn) & !is.na(TaxiOut))
> 
> 
> # Print the maximum taxiing difference of temp2 with summarise()
> summarise(temp2, max_taxi_diff = max(abs(TaxiIn - TaxiOut)))
Source: local data frame [1 x 1]

  max_taxi_diff
1           141
> # Generate summarizing statistics for hflights
> summarise(hflights, n_obs = length(Year), 
    n_carrier = length(unique(UniqueCarrier)),
    n_dest = length(unique(Dest)),
    dest100 = Dest[100])
Source: local data frame [1 x 4]

  n_obs n_carrier n_dest dest100
1 22751        15    115     EWR
> 
> # Filter hflights to keep all American Airline flights: aa
> aa = filter(hflights, UniqueCarrier == "American")
> 
> # Generate summarizing statistics for aa 
> summarise(aa, n_flights = length(Year),
    n_canc = length(which(Cancelled==1)),
    p_canc = length(which(Cancelled==1))/length(Cancelled) * 100,
    avg_delay = mean(ArrDelay, na.rm = TRUE))
Source: local data frame [1 x 4]

  n_flights n_canc   p_canc avg_delay
1       315      5 1.587302 -0.116129
> # Generate summarizing statistics for hflights
> summarise(hflights, n_obs = length(Year), 
    n_carrier = length(unique(UniqueCarrier)),
    n_dest = length(unique(Dest)),
    dest100 = Dest[100])
Source: local data frame [1 x 4]

  n_obs n_carrier n_dest dest100
1 22751        15    115     EWR
> 
> # Filter hflights to keep all American Airline flights: aa
> aa = filter(hflights, UniqueCarrier == "American")
> 
> # Generate summarizing statistics for aa 
> summarise(aa, n_flights = length(Year),
    n_canc = length(which(Cancelled==1)),
    p_canc = round(length(which(Cancelled==1))/length(Cancelled) * 100,2),
    avg_delay = round(mean(ArrDelay, na.rm = TRUE),2))
Source: local data frame [1 x 4]

  n_flights n_canc p_canc avg_delay
1       315      5   1.59     -0.12
> # Solve the exercise using a combination of dplyr verbs and %>%
> hflights %>%
    filter(Distance / (ActualElapsedTime + 100) * 60 < 105 | Cancelled == 1 | Diverted == 1) %>%
    summarise( n_non = n(), 
             p_non = n()/nrow(hflights) * 100 ,
             n_dest = n_distinct(Dest), 
             min_dist = min(Distance), 
             max_dist = max(Distance))
Source: local data frame [1 x 5]

  n_non   p_non n_dest min_dist max_dist
1  4294 18.8739     84       79     2007
> # Count the number of overnight flights
> hflights %>%
    filter(!is.na(DepTime) & !is.na(ArrTime) & ArrTime < DepTime) %>%
    summarise(n = n())
Source: local data frame [1 x 1]

    n
1 265

Generate a per-carrier summary of hflights with the following variables: n_flights, the number of flights flown by the carrier; n_canc, the number of cancelled flights; p_canc, the percentage of cancelled flights; avg_delay, the average arrival delay of flights whose delay does not equal NA. Next, order the carriers in the summary from low to high by their average arrival delay. Use percentage of flights cancelled to break any ties. Which airline scores best based on these statistics?

Generate a per-day-of-week summary of hflights with the variable avg_taxi, the average total taxiing time. Pipe this summary into an arrange() call such that the day with the highest avg_taxi comes first.

> # Make an ordered per-carrier summary of hflights
> hflights %>%
    group_by(UniqueCarrier) %>%
    summarise(n_flights = n(),
              n_canc = length(which(Cancelled==1)),
              p_canc = n_canc / n_flights * 100,
              avg_delay = mean(ArrDelay, na.rm=TRUE)) %>%
    arrange(avg_delay)
Source: local data frame [15 x 5]

        UniqueCarrier n_flights n_canc     p_canc  avg_delay
1          US_Airways       422      5  1.1848341 -1.0721154
2                Mesa         7      1 14.2857143 -0.8333333
3            American       315      5  1.5873016 -0.1161290
4             AirTran       225      2  0.8888889  0.7117117
5              Alaska        32      0  0.0000000  1.0625000
6            Frontier        79      0  0.0000000  4.0000000
7             JetBlue        72      3  4.1666667  4.0000000
8         Continental      6978     50  0.7165377  6.0759329
9  Atlantic_Southeast       229      7  3.0567686  6.2927928
10             United       194      5  2.5773196  6.5925926
11              Delta       241      5  2.0746888  7.1440678
12          Southwest      4572     74  1.6185477  7.5294903
13         ExpressJet      7325    112  1.5290102  7.6131367
14     American_Eagle       441      8  1.8140590  7.8101852
15            SkyWest      1619     26  1.6059296  9.1620429
> 
> # Make an ordered per-day summary of hflights
> hflights %>%
    group_by(DayOfWeek) %>%
    summarise(avg_taxi = mean(TaxiIn + TaxiOut, na.rm = TRUE)) %>%
    arrange(desc(avg_taxi))
Source: local data frame [7 x 2]

  DayOfWeek avg_taxi
1         1 21.61392
2         5 21.39562
3         2 21.28001
4         3 21.23543
5         4 21.16079
6         7 20.78389
7         6 20.68192
> # Solution to first instruction
> hflights %>%
    filter(!is.na(ArrDelay)) %>%
    group_by(UniqueCarrier) %>%
    summarise(p_delay = length(which(ArrDelay > 0)) / n()) %>%
    mutate(rank = rank(p_delay)) %>%
    arrange(rank)
Source: local data frame [15 x 3]

        UniqueCarrier   p_delay rank
1             AirTran 0.2972973    1
2              Alaska 0.3125000    2
3          US_Airways 0.3149038    3
4             JetBlue 0.3235294    4
5            American 0.3258065    5
6                Mesa 0.3333333    6
7  Atlantic_Southeast 0.3693694    7
8      American_Eagle 0.3750000    8
9               Delta 0.3983051    9
10             United 0.4550265   10
11          Southwest 0.4645003   11
12         ExpressJet 0.4867798   12
13        Continental 0.4937807   13
14           Frontier 0.5063291   14
15            SkyWest 0.5422446   15
> 
> # Solution to second instruction
> hflights %>%
    filter(!is.na(ArrDelay) & ArrDelay > 0) %>%
    group_by(UniqueCarrier) %>%
    summarise(avg = mean(ArrDelay)) %>%
    mutate(rank = rank(avg)) %>%
    arrange(rank)
Source: local data frame [15 x 3]

        UniqueCarrier      avg rank
1            Frontier 13.42500    1
2                Mesa 14.00000    2
3          US_Airways 20.19084    3
4         Continental 21.83245    4
5            American 23.07921    5
6          ExpressJet 23.45569    6
7             SkyWest 24.31395    7
8           Southwest 25.18831    8
9             AirTran 26.28788    9
10             United 28.03488   10
11             Alaska 28.30000   11
12              Delta 33.97872   12
13            JetBlue 36.63636   13
14     American_Eagle 39.07407   14
15 Atlantic_Southeast 39.23171   15
> # Which plane (by tail number) flew out of Houston the most times? How many times? adv1
> adv1 <- hflights %>%
          group_by(TailNum) %>%
          summarise(n = n()) %>%
          filter(n == max(n))
> adv1
Source: local data frame [1 x 2]

  TailNum   n
1  N16944 115
> # How many airplanes only flew to one destination from Houston? adv2
> adv2 <- hflights %>%
          group_by(TailNum) %>%
          summarise(ndest = n_distinct(Dest)) %>%
          filter(ndest == 1) %>%
          summarise(nplanes = n())
> adv2
Source: local data frame [1 x 1]

  nplanes
1    1039
> 
> # Find the most visited destination for each carrier: adv3
> adv3 <- hflights %>% 
          group_by(UniqueCarrier, Dest) %>%
          summarise(n = n()) %>%
          mutate(rank = rank(desc(n))) %>%
          filter(rank == 1)
> adv3
Source: local data frame [15 x 4]
Groups: UniqueCarrier

        UniqueCarrier Dest   n rank
1             AirTran  ATL 211    1
2              Alaska  SEA  32    1
3            American  DFW 186    1
4      American_Eagle  DFW 234    1
5  Atlantic_Southeast  DTW  89    1
6         Continental  LAX 417    1
7               Delta  ATL 219    1
8          ExpressJet  CRP 313    1
9            Frontier  DEN  78    1
10            JetBlue  JFK  72    1
11               Mesa  CLT   6    1
12            SkyWest  COS 143    1
13          Southwest  DAL 839    1
14         US_Airways  CLT 237    1
15             United  ORD  59    1
> # Find the carrier that travels to each destination the most: adv4
> adv4 <- hflights %>% 
          group_by(Dest, UniqueCarrier) %>%
          summarise(n = n()) %>%
          mutate(rank = rank(desc(n))) %>%
          filter(rank == 1)
> adv4
Source: local data frame [115 x 4]
Groups: Dest

   Dest UniqueCarrier   n rank
1   ABQ    ExpressJet 109    1
2   AEX    ExpressJet  76    1
3   AMA    ExpressJet 132    1
4   ANC   Continental  18    1
5   ASE       SkyWest   9    1
6   ATL         Delta 219    1
7   AUS   Continental 274    1
8   AVL    ExpressJet  29    1
9   BFL       SkyWest  43    1
10  BHM    ExpressJet 193    1
..  ...           ... ...  ...
> # set up a src that connects to the mysql database (src_mysql is provided by dplyr)
> my_db <- src_mysql(dbname = "dplyr", 
                  host = "dplyr.csrrinzqubik.us-east-1.rds.amazonaws.com", 
                  port = 3306, 
                  user = "dplyr",
                  password = "dplyr")
> 
> # and reference a table within that src: nycflights is now available as an R object that references to the remote nycflights table
> nycflights <- tbl(my_db, "dplyr")
> 
> # glimpse at nycflights
> glimpse(nycflights)
Observations: 336776
Variables:
$ id        (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17...
$ year      (int) 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 201...
$ month     (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
$ day       (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
$ dep_time  (int) 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, 55...
$ dep_delay (int) 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1, ...
$ arr_time  (int) 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849, 8...
$ arr_delay (int) 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -14,...
$ carrier   (chr) "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "AA...
$ tailnum   (chr) "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N39463...
$ flight    (int) 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 49,...
$ origin    (chr) "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA", "...
$ dest      (chr) "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD", "...
$ air_time  (int) 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 158...
$ distance  (int) 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, 10...
$ hour      (int) 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, ...
$ minute    (int) 17, 33, 42, 44, 54, 54, 55, 57, 57, 58, 58, 58, 58, 58, 5...
> 
> 
> # Calculate the grouped summaries detailed in the instructions
>   nycflights %>%
    group_by(carrier) %>%
    summarise(n_flights = n(), avg_delay = mean(arr_delay)) %>%
    arrange(avg_delay)
Source: mysql 5.6.21-log [dplyr@dplyr.csrrinzqubik.us-east-1.rds.amazonaws.com:/dplyr]
From: <derived table> [?? x 3]
Arrange: avg_delay 
Warning message: Decimal MySQL column 2 imported as numeric
   carrier n_flights avg_delay
1       AS       714   -9.8613
2       HA       342   -6.9152
3       AA     32729    0.3556
4       DL     48110    1.6289
5       VX      5162    1.7487
6       US     20536    2.0565
7       UA     58665    3.5045
8       9E     18460    6.9135
9       B6     54635    9.3565
10      WN     12275    9.4675
..     ...       ...       ...