Daily Assignment 7
Data wrangling and plotting
- Using the built-in starwars data set in the dplyr package, create a new tibble with the following modifications:
- It only includes the species that have more than 1 individual represented in the data set. (hint: first use dplyr to summarize the number of individuals per species in the data frame; then create a vector/tibble of those names; subset the species that are included in that vector)
library(dplyr)##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data(starwars)
# One row per species together with the number of characters recorded for it.
newtibble <- summarise(
  group_by(starwars, species),
  TotalNumber = n(),
  .groups = "drop"
)
# Retain only the species that are represented by more than one character.
starwars_speciesgr1 <- newtibble %>%
  filter(TotalNumber > 1)
print(starwars_speciesgr1)## # A tibble: 9 × 2
## species TotalNumber
## <chr> <int>
## 1 Droid 6
## 2 Gungan 3
## 3 Human 35
## 4 Kaminoan 2
## 5 Mirialan 2
## 6 Twi'lek 2
## 7 Wookiee 2
## 8 Zabrak 2
## 9 <NA> 4
- It only includes the columns that are not of type list (hint: use glimpse to check)
glimpse(starwars_speciesgr1)## Rows: 9
## Columns: 2
## $ species <chr> "Droid", "Gungan", "Human", "Kaminoan", "Mirialan", "Twi'l…
## $ TotalNumber <int> 6, 3, 35, 2, 2, 2, 2, 2, 4
# Keep the characters whose species occurs more than once, reusing the species
# already identified in starwars_speciesgr1 instead of retyping their names.
# Characters with a missing species are excluded, exactly as with the
# hand-written list of names this replaces.
# NOTE(review): the list-columns (films, vehicles, starships) are still present
# in this tibble; appending `%>% select(!where(is.list))` would drop them.
nottypelist <- filter(
  starwars,
  !is.na(species),
  species %in% starwars_speciesgr1$species
)
print(nottypelist)## # A tibble: 54 × 14
## name height mass hair_…¹ skin_…² eye_c…³ birth…⁴ sex gender homew…⁵
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 Luke Skywa… 172 77 blond fair blue 19 male mascu… Tatooi…
## 2 C-3PO 167 75 <NA> gold yellow 112 none mascu… Tatooi…
## 3 R2-D2 96 32 <NA> white,… red 33 none mascu… Naboo
## 4 Darth Vader 202 136 none white yellow 41.9 male mascu… Tatooi…
## 5 Leia Organa 150 49 brown light brown 19 fema… femin… Aldera…
## 6 Owen Lars 178 120 brown,… light blue 52 male mascu… Tatooi…
## 7 Beru White… 165 75 brown light blue 47 fema… femin… Tatooi…
## 8 R5-D4 97 32 <NA> white,… red NA none mascu… Tatooi…
## 9 Biggs Dark… 183 84 black light brown 24 male mascu… Tatooi…
## 10 Obi-Wan Ke… 182 77 auburn… fair blue-g… 57 male mascu… Stewjon
## # … with 44 more rows, 4 more variables: species <chr>, films <list>,
## # vehicles <list>, starships <list>, and abbreviated variable names
## # ¹hair_color, ²skin_color, ³eye_color, ⁴birth_year, ⁵homeworld
- Clean your data set: only include observations/rows that do not have an NA in the height column.
anyNA(nottypelist)## [1] TRUE
# Drop the rows with a missing height, referring to the column by name rather
# than by position so the step keeps working if the columns are reordered.
cleanstarwars_height <- filter(nottypelist, !is.na(height))
print(cleanstarwars_height)## # A tibble: 49 × 14
## name height mass hair_…¹ skin_…² eye_c…³ birth…⁴ sex gender homew…⁵
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 Luke Skywa… 172 77 blond fair blue 19 male mascu… Tatooi…
## 2 C-3PO 167 75 <NA> gold yellow 112 none mascu… Tatooi…
## 3 R2-D2 96 32 <NA> white,… red 33 none mascu… Naboo
## 4 Darth Vader 202 136 none white yellow 41.9 male mascu… Tatooi…
## 5 Leia Organa 150 49 brown light brown 19 fema… femin… Aldera…
## 6 Owen Lars 178 120 brown,… light blue 52 male mascu… Tatooi…
## 7 Beru White… 165 75 brown light blue 47 fema… femin… Tatooi…
## 8 R5-D4 97 32 <NA> white,… red NA none mascu… Tatooi…
## 9 Biggs Dark… 183 84 black light brown 24 male mascu… Tatooi…
## 10 Obi-Wan Ke… 182 77 auburn… fair blue-g… 57 male mascu… Stewjon
## # … with 39 more rows, 4 more variables: species <chr>, films <list>,
## # vehicles <list>, starships <list>, and abbreviated variable names
## # ¹hair_color, ²skin_color, ³eye_color, ⁴birth_year, ⁵homeworld
- Then, use dplyr to print a summary table showing the mean, median, and standard deviation of height for each species
# Mean, median and standard deviation of height for every species; the result
# is returned ungrouped.
starwars_heightsummary <- summarise(
  group_by(cleanstarwars_height, species),
  meanHeight = mean(height),
  medHeight = median(height),
  stdevHeight = sd(height),
  .groups = "drop"
)
print(starwars_heightsummary)## # A tibble: 8 × 4
## species meanHeight medHeight stdevHeight
## <chr> <dbl> <dbl> <dbl>
## 1 Droid 131. 97 49.1
## 2 Gungan 209. 206 14.2
## 3 Human 177. 180 12.5
## 4 Kaminoan 221 221 11.3
## 5 Mirialan 168 168 2.83
## 6 Twi'lek 179 179 1.41
## 7 Wookiee 231 231 4.24
## 8 Zabrak 173 173 2.83
- Create a box plot figure that shows the variation in heights among Star Wars species. Do the values in your summary table line up with the plot? Code so that each box plot (Species) is filled with a different color. Change your axes labels so that species is capitalized and height is capitalized and shows units (cm). Use the theme_minimal option. Then look into the annotate or geom_text functions and add the text “p=1.2E-0.5” to the top right corner.
library(ggplot2)
library(viridis)## Loading required package: viridisLite
# One "magma" colour per species shown in the plot, so the palette keeps
# matching the data if the set of species changes.
cols <- viridis(length(unique(cleanstarwars_height$species)), option = "magma")
# Box plot of height per species, each box filled with its own colour.
starwars_heightboxplot <- ggplot(
  data = cleanstarwars_height,
  aes(x = species, y = height, fill = species)
) +
  geom_boxplot() + # one box per species
  scale_fill_manual(values = cols) + # fill the boxes with the palette above
  labs(x = "Species", y = "Height (cm)") + # capitalised labels, height with units
  # x = Inf / y = Inf pin the label to the top right corner of the panel;
  # hjust and vjust nudge it inwards so it is not cut off at the edge.
  annotate(
    "text",
    x = Inf, y = Inf,
    label = "p=1.2E-0.5",
    hjust = 1.1, vjust = 1.5
  ) +
  theme_minimal(base_size = 10) # minimal theme with a base font size of 10
print(starwars_heightboxplot)
For an extra (optional) challenge: overlay the data points onto the box plots (hint: look into geom_jitter or geom_point). See if you can change the color of the points, depending on the gender or sex variables.
# Overlay the individual observations on the box plots twice: once with
# geom_jitter coloured by gender, once with geom_point coloured by sex.
starwars_heightboxplot +
  geom_jitter(aes(colour = gender)) # trying both geom_jitter and geom_point
starwars_heightboxplot +
  geom_point(aes(colour = sex))
- Using the same or a different built-in data set, create at least one additional (and different kind of) plot. Experiment with different arguments, features, and colors.
- Comparing Sepal Length to Petal Length in Iris Species
# Scatter plot of petal length against sepal length; every iris species gets
# its own colour and point shape.
i1 <- ggplot(
  data = iris,
  mapping = aes(
    x = Sepal.Length, y = Petal.Length,
    color = Species, shape = Species
  )
) +
  geom_point(size = 1) +
  labs(
    x = "Sepal Length",
    y = "Petal Length",
    title = "Comparing Sepal Length to Petal Length in Various Species of Irises"
  ) +
  theme(plot.title = element_text(size = 11)) # title drawn at font size 11
i1
- UCB Admission to Department By Sex
# Using UCBAdmissions data, which includes # of acceptances and rejections to depts by sex
class(UCBAdmissions)## [1] "table"
# Convert a contingency table (or flat contingency table) into a data frame.
#
# mydata: an "ftable" object, or anything ftable() can flatten (e.g. a "table").
# Returns a data frame with one column per row variable, followed by one
# column of counts per combination of column-variable levels (the levels of
# several column variables are joined with "_").
ftable2df <- function(mydata) {
  # Plain if + inherits(): class() can have length > 1, and ifelse() is meant
  # for vectors, not for choosing between two assignments.
  if (!inherits(mydata, "ftable")) {
    mydata <- ftable(mydata)
  }
  # expand.grid() varies its first argument fastest; reversing before and
  # after makes the last variable vary fastest instead.
  dfrows <- rev(expand.grid(rev(attr(mydata, "row.vars"))))
  dfcols <- as.data.frame.matrix(mydata)
  names(dfcols) <- do.call(
    paste, c(rev(expand.grid(rev(attr(mydata, "col.vars")))), sep = "_")
  )
  cbind(dfrows, dfcols)
} # not my code, but this allowed me to convert table data into a df!
# Flatten UCBAdmissions with its 3rd and 2nd dimensions as row variables,
# convert the result to a data frame, and add the number of applications
# (admitted plus rejected) for every row.
ucb_table <- UCBAdmissions %>%
  ftable(row.vars = 3:2) %>%
  ftable2df() %>%
  transform(Applications = Admitted + Rejected)
# Dodged bar chart comparing the number of admitted applicants per department
# by sex. Bars and count labels use the same dodge width so that every label
# is centred above its own bar.
ucb_admit_plot <- ggplot(
  data = ucb_table,
  aes(x = Dept, y = Admitted, fill = Gender)
) +
  geom_col(position = position_dodge(width = 0.9)) +
  geom_text(
    aes(label = Admitted),
    position = position_dodge(width = 0.9),
    vjust = -0.4, color = "black", size = 2
  ) +
  scale_fill_manual(values = c("darkcyan", "limegreen")) +
  labs(
    x = "Department",
    y = "# of Admits",
    fill = "Sex",
    title = "UCB Admission to Department By Sex"
  )
- Effect of Chicken Diet on Growth Over Time
# Using ChickWeight, a data set which compares weight to age of chicks on different diets
# Large data set, so average the weight per diet and time point before
# plotting. .groups = "drop" returns an ungrouped tibble and avoids the
# "grouped output" message that summarise() prints when it is left out.
cwavg <- ChickWeight %>%
  group_by(Diet, Time) %>%
  summarize(meanWeight = mean(weight), .groups = "drop")
# Scatter plot of the mean weight over time, with a separate colour and point
# shape for every diet.
cwavg_scatter <- ggplot(data = cwavg) +
  geom_point(aes(x = Time, y = meanWeight, shape = Diet, color = Diet)) +
  labs(
    x = "Time",
    y = "Weight",
    title = "Effect of Chicken Diet on Weight Gain Over Time"
  )
print(cwavg_scatter)
Finally, create a multi-panel figure with the graphs you created.
library(patchwork)
# Two stacked two-panel figures built with patchwork.
print(starwars_heightboxplot / i1) # division symbol puts plots over one another
print(ucb_admit_plot / cwavg_scatter)