# How many rows of bike_share_rides are exact copies of an earlier row?
bike_share_rides %>%
  duplicated() %>%
  sum()

# Drop the exact copies, keeping one instance of every distinct row
bike_share_rides_unique <- bike_share_rides %>%
  distinct()

# Re-run the full-duplicate count on the de-duplicated data
bike_share_rides_unique %>%
  duplicated() %>%
  sum()
# List every ride_id that occurs on more than one row, with its row count n
filter(count(bike_share_rides, ride_id), n > 1)

# De-duplicate on ride_id alone rather than on all columns, which removes
# both full and partial duplicates; .keep_all = TRUE retains every column
bike_share_rides_unique <- distinct(
  bike_share_rides,
  ride_id,
  .keep_all = TRUE
)
# Collapse partial duplicates: rows sharing ride_id and date are reduced to a
# single row that carries the mean of their duration_min values
bike_share_rides %>%
  # Compute the statistic within each (ride_id, date) group
  group_by(ride_id, date) %>%
  # duration_min_avg: mean of duration_min across the rows of the group
  mutate(duration_min_avg = mean(duration_min)) %>%
  # Drop the grouping once the per-group mean exists, so the result is an
  # ungrouped table and later verbs do not silently operate per group
  ungroup() %>%
  # Keep one row per ride_id/date pair, retaining all columns
  distinct(ride_id, date, .keep_all = TRUE) %>%
  # duration_min is superseded by duration_min_avg
  select(-duration_min)
# Rows of sfo_survey whose dest_size has no match in the dest_sizes data
# frame, i.e. the bad dest_size rows
anti_join(sfo_survey, dest_sizes, by = "dest_size") %>%
  # Show only the id, airline, destination, and dest_size columns
  select(id, airline, destination, dest_size)
# Derive cleaned-up text columns and store them back on sfo_survey
sfo_survey <- sfo_survey %>%
  # dest_size_trimmed: dest_size with surrounding whitespace removed
  mutate(dest_size_trimmed = str_trim(dest_size)) %>%
  # cleanliness_lower: cleanliness converted to lowercase
  mutate(cleanliness_lower = str_to_lower(cleanliness))

# Tally the values of dest_size_trimmed
count(sfo_survey, dest_size_trimmed)

# Tally the values of cleanliness_lower
count(sfo_survey, cleanliness_lower)
# Tally the categories of dest_region
count(sfo_survey, dest_region)

# Spellings of dest_region that should all be mapped to "Europe"
europe_categories <- c("EU", "eur", "Europ")

# Build dest_region_collapsed, in which every category listed in
# europe_categories is merged into the single level Europe, then tally it
# (the new column is only used for this count; sfo_survey is not reassigned)
sfo_survey %>%
  mutate(
    dest_region_collapsed = fct_collapse(
      dest_region,
      Europe = europe_categories
    )
  ) %>%
  count(dest_region_collapsed)
# Rows of sfo_survey whose phone value contains a "-"
filter(sfo_survey, str_detect(phone, "-"))
# No comments:
# Post a Comment