## Creating the US student data barplot (but better) from Collaborative Stats.
##
## Walks through building a labelled, ordered bar chart of race/ethnicity
## percentages, then a grouped bar chart comparing the overall student
## population with AP examinees, and finally pie-chart and dot-plot
## alternatives for the single data set.

# First, read in the file (you'll need to have it saved and enter the
# appropriate location):
college.data <- read.csv(
  "/Users/matt/Documents/STATS/collegeboard.csv",
  header = TRUE
)

# Pull the columns we plot into variables of the same name. This lets us
# refer to them directly without attach(), which would leave the data frame
# on the search path and can silently mask (or be masked by) other objects.
needed_cols <- c(
  "Race.Ethnicity",
  "Overall.Student.Population",
  "AP.Examinee.Population"
)
# Fail loudly here rather than plotting NULL later if a column is missing.
stopifnot(needed_cols %in% names(college.data))

Race.Ethnicity <- college.data$Race.Ethnicity
Overall.Student.Population <- college.data$Overall.Student.Population
AP.Examinee.Population <- college.data$AP.Examinee.Population

# Let's look at it:
college.data
Race.Ethnicity
Overall.Student.Population[2]

# Let's make a bar chart. The basis is pretty straightforward...
barplot(Overall.Student.Population)

# ...but the details can get messy. And the details are important.
# "main" is the argument to add a title, "ylab" labels the y-axis,
# "xlab" labels the x-axis.
barplot(
  Overall.Student.Population,
  main = "Race/Ethnicity in U.S. Public Schools",
  ylab = "Percentage of Students",
  xlab = "Race/Ethnicity"
)

# If we want anyone to know what's happening we should probably label the
# bars. This information is already encoded in college.data; names.arg is
# the argument to send it where we want it:
barplot(
  Overall.Student.Population,
  main = "Race/Ethnicity in U.S. Public Schools",
  ylab = "Percentage of Students",
  xlab = "Race/Ethnicity",
  names.arg = Race.Ethnicity
)

# Hmmm.... so close. "cex.names" will shrink the font of the labels.
barplot(
  Overall.Student.Population,
  main = "Race/Ethnicity in U.S. Public Schools",
  ylab = "Percentage of Students",
  xlab = "Race/Ethnicity",
  names.arg = Race.Ethnicity,
  cex.names = 0.4
)

# Let's re-order the bars in order of size (as the data does not have any
# other more natural order). "rev" reverses the list so that the largest
# comes first.
o <- rev(order(Overall.Student.Population))
barplot(
  Overall.Student.Population[o],
  main = "Race/Ethnicity in U.S. Public Schools",
  ylab = "Percentage of Students",
  xlab = "Race/Ethnicity",
  names.arg = Race.Ethnicity[o],
  cex.names = 0.4
)

# Actually, Not reported/other might be best at the end. Let's fudge the
# ordering manually. These are row positions in college.data, so they must
# be updated by hand if the rows of the CSV change.
o2 <- c(5, 3, 2, 1, 4, 6)
barplot(
  Overall.Student.Population[o2],
  main = "Race/Ethnicity in U.S. Public Schools",
  ylab = "Percentage of Students",
  xlab = "Race/Ethnicity",
  names.arg = Race.Ethnicity[o2],
  cex.names = 0.4
)

# The graph is not cluttered, and we have more data. Given that the
# interesting bit about the data is the comparison between the overall
# population and those who take AP classes, can we put it all on one picture
# to compare more easily? Yes. Basics first ("rbind" combines our two lists
# into a two-row matrix; beside = TRUE draws the rows as side-by-side bars):
barplot(
  rbind(Overall.Student.Population, AP.Examinee.Population),
  beside = TRUE
)

# Bells and whistles back in, just as before:
barplot(
  rbind(Overall.Student.Population[o2], AP.Examinee.Population[o2]),
  beside = TRUE,
  main = "Race/Ethnicity of the Student Population and AP Candidates",
  ylab = "Percentage of Students",
  xlab = "Race/Ethnicity",
  names.arg = Race.Ethnicity[o2],
  cex.names = 0.4
)

# And now we need a legend as well. The legend entries follow the row order
# of the rbind() matrix, and "col" colours the bars in that same order.
barplot(
  rbind(Overall.Student.Population[o2], AP.Examinee.Population[o2]),
  beside = TRUE,
  main = "Race/Ethnicity of the Student Population and AP Candidates",
  ylab = "Percentage of Students",
  xlab = "Race/Ethnicity",
  names.arg = Race.Ethnicity[o2],
  cex.names = 0.4,
  col = c("blue", "red"),
  legend.text = c("Student Population", "AP Candidates")
)
# Woohoo!

# Going back to the original plot with the single data set...
barplot(
  Overall.Student.Population[o2],
  main = "Race/Ethnicity in U.S. Public Schools",
  ylab = "Percentage of Students",
  xlab = "Race/Ethnicity",
  names.arg = Race.Ethnicity[o2],
  cex.names = 0.4
)

# ... how else might we represent it? A pie chart:
pie(Overall.Student.Population)

# Add the bits and bobs:
pie(
  Overall.Student.Population,
  labels = Race.Ethnicity,
  main = "Race/Ethnicity in U.S. Public Schools"
)
# Rule of thumb: pie charts are a bit rubbish.

# A dot plot:
dotchart(Overall.Student.Population)

# Again, we need some labels:
dotchart(
  Overall.Student.Population,
  labels = Race.Ethnicity,
  main = "Race/Ethnicity in U.S. Public Schools",
  xlab = "Percentage"
)

# Sort as before (the rev reverses o2 (again!) to put the largest at the
# top):
dotchart(
  Overall.Student.Population[rev(o2)],
  labels = Race.Ethnicity[rev(o2)],
  main = "Race/Ethnicity in U.S. Public Schools",
  xlab = "Percentage"
)