# Thursday, September 17, 2020
#
# Tags: tidyverse, ggplot, FAQ

 

# How many rows of bike_share_rides are exact copies of an earlier row?
bike_share_rides %>%
  duplicated() %>%
  sum()

# Keep a single copy of every fully identical row
bike_share_rides_unique <- bike_share_rides %>%
  distinct()

# Re-run the duplicate count on the de-duplicated data as a sanity check
bike_share_rides_unique %>%
  duplicated() %>%
  sum()


# List every ride_id that occurs more than once, with its occurrence count n
filter(count(bike_share_rides, ride_id), n > 1)

# De-duplicate on ride_id alone (rather than on all columns); .keep_all = TRUE
# retains the remaining columns from the first row seen for each ride_id
bike_share_rides_unique <- distinct(bike_share_rides, ride_id, .keep_all = TRUE)


# Collapse partial duplicates: rows sharing the same ride_id and date are
# reduced to one row carrying the average of their duration_min values.
bike_share_rides %>%
  # Group by ride_id and date so the mean is computed per (ride_id, date) pair
  group_by(ride_id, date) %>%
  # Add duration_min_avg: the mean duration_min within each group
  mutate(duration_min_avg = mean(duration_min)) %>%
  # Keep one row per (ride_id, date), retaining all other columns
  distinct(ride_id, date, .keep_all = TRUE) %>%
  # Drop the original per-row duration now that the average replaces it
  select(-duration_min) %>%
  # Drop the grouping so the result is a plain tibble; otherwise any later
  # verb applied to it would silently operate per (ride_id, date) group
  ungroup()


# Survey rows whose dest_size has no match in dest_sizes: the anti-join keeps
# only the sfo_survey rows without a matching dest_size value. Show just the
# id, airline, destination, and dest_size columns for inspection.
select(
  anti_join(sfo_survey, dest_sizes, by = "dest_size"),
  id, airline, destination, dest_size
)

# Extend sfo_survey with cleaned-up text columns:
#   dest_size_trimmed - dest_size with surrounding whitespace removed
#   cleanliness_lower - cleanliness converted to lowercase
sfo_survey <- mutate(
  sfo_survey,
  dest_size_trimmed = str_trim(dest_size),
  cleanliness_lower = str_to_lower(cleanliness)
)

# Tabulate the distinct values of dest_size_trimmed
count(sfo_survey, dest_size_trimmed)

# Tabulate the distinct values of cleanliness_lower
count(sfo_survey, cleanliness_lower)


# Tabulate the raw dest_region categories
count(sfo_survey, dest_region)

# Spellings of dest_region that should all be treated as "Europe"
europe_categories <- c("EU", "eur", "Europ")

# Build dest_region_collapsed by folding every value in europe_categories
# into a single "Europe" level, then tabulate the collapsed categories
count(
  mutate(
    sfo_survey,
    dest_region_collapsed = fct_collapse(dest_region, Europe = europe_categories)
  ),
  dest_region_collapsed
)


# Show the survey rows whose phone value contains a "-" character
filter(sfo_survey, str_detect(phone, "-"))





# No comments:
#
# Post a Comment