# Chapter 3 - Visualizing Data

from matplotlib import pyplot as plt

# a plot example based on scratch/visualization.py with a few extras

# For more examples & docs, search for "matplotlib docs",
# e.g. https://matplotlib.org/3.1.1/tutorials/index.html

# --- data for the line chart ---
years = [1950, 1960, 1970, 1980, 1990, 2000, 2010]
gdp = [300.2, 543.3, 1075.9, 2862.5, 5979.6, 10289.7, 14958.3]

# NOTE(review): years2 and other are not passed to any plot call in this
# section; the red dots below are drawn from literal coordinates instead.
years2 = [1975, 1995]
other = [200, 400]

# Start a new figure: 3" wide by 2" tall, rendered at 220 dots per inch.
plt.figure(figsize=(3, 2), dpi=220)

# Line chart with years on the x-axis and gdp on the y-axis.
# (The 'label' is what gets put in the legend; see below.)
plt.plot(
    years,
    gdp,
    label='GDP',
    color='green',
    linestyle='solid',
    marker='o',
)

# A second series on the same axes: two unconnected red dots.
plt.plot(
    [1975, 1995],
    [8000, 20000],
    label='other',
    color='red',
    linestyle='none',
    marker='.',
)

# Y-axis text and title.
plt.ylabel("Billions of $")
plt.title("Nominal GDP")

# Legend built from the 'label' values given to the plot calls above
# (search e.g. "matplotlib legend example" to find the docs).
plt.legend(loc='upper left')

# Write the figure to a file ...
plt.savefig('gdp_plot.png')

# ... and/or display it.
plt.show()

# Bar chart: number of Academy Awards per movie.

movies = ["Annie Hall", "Ben-Hur", "Casablanca", "Gandhi", "West Side Story"]
num_oscars = [5, 11, 3, 8, 10]

# One bar per movie, at x-coordinates 0, 1, 2, 3, 4.
bar_positions = range(len(movies))

plt.figure(figsize=(3, 2), dpi=220)   # new 3" x 2" figure at 220 dpi

# Bars at bar_positions with heights taken from num_oscars.
plt.bar(bar_positions, num_oscars)

# -- font sizes --
# The sizes below were picked by trial and error. The approach came from
# searching "matplotlib set font size", comparing several methods, and
# settling on the one described at kite.com/python/answers/
# how-to-set-the-font-size-of-the-figure-title-and-axis-labels-in-a-matplotlib-graph-in-python
plt.title("My Favorite Movies", fontsize=9)
plt.ylabel("# of Academy Awards", fontsize=7)

# Label each bar with its movie name; shrink tick text on both axes.
plt.xticks(bar_positions, movies, fontsize=5)
plt.yticks(fontsize=5)

plt.show()

from collections import Counter 

# Histogram of exam grades, bucketed by decile.

plt.figure(dpi=220, figsize=(3, 2))   # new 3" x 2" figure at 220 dpi

grades = [83, 95, 91, 87, 70, 0, 85, 82, 100, 67, 73, 77, 0]

# Bucket grades by decile, but put 100 in with the 90s.
# Keys are the decile floor (0, 10, ..., 90); values are student counts.
histogram = Counter(min(grade // 10 * 10, 90) for grade in grades)

plt.bar([decile + 5 for decile in histogram],  # shift bars right by 5 (bar centers)
        histogram.values(),                    # give each bar its correct height
        10,                                    # give each bar a width of 10
        edgecolor=(0, 0, 0))                   # black edges for each bar

# x-axis from -5 to 105, y-axis from 0 to 5.
# The left limit is -5 (not 5) so the 0-10 bar holding the grades of 0 is
# fully inside the view, with the same small margin as on the right side.
plt.axis([-5, 105, 0, 5])

plt.xticks([10 * i for i in range(11)])  # x-axis labels at 0, 10, ..., 100
plt.xlabel("Decile")
plt.ylabel("# of Students")
plt.title("Distribution of Exam 1 Grades")

plt.show()

# Bar chart with a deliberately misleading y-axis.

mentions = [500, 505]
years = [2017, 2018]

plt.figure(figsize=(3, 2), dpi=220)   # new 3" x 2" figure at 220 dpi

plt.bar(years, mentions, width=0.8)
plt.xticks(years)
plt.ylabel("# of times 'data science' heard", fontsize='small')

# Turn off offset notation for tick labels. Without this, matplotlib will
# label the x-axis 0, 1 and then add a +2.013e3 off in the corner.
plt.ticklabel_format(useOffset=False)

# The misleading part: the y-axis runs only from 499 to 506, so just the
# portion of each bar above ~500 is visible.
plt.axis([2016.5, 2018.5, 499, 506])
plt.title("Look at the 'Huge' Increase!")

plt.show()

# ... and now not misleading.
# Same data as the previous chart (reuses `years` and `mentions` defined
# there), but with the y-axis starting at 0.

plt.figure(dpi=220, figsize=(3, 2))   # init & set size of the new plot (3" x 2", 220dpi)

plt.bar(years, mentions, 0.8)
plt.xticks(years)
plt.ylabel("# of times 'data science' heard", fontsize='small')
plt.ticklabel_format(useOffset=False)   # plain tick labels, no offset notation

# y-axis from 0 to 550, so the full height of each bar is visible.
plt.axis([2016.5, 2018.5, 0, 550])
plt.title("Not So Huge Anymore")

# Display the figure explicitly, like every other section of this file.
plt.show()

# Scatterplot of test 1 vs. test 2 grades, with equal axis scaling.

test_1_grades = [99, 90, 85, 97, 80]
test_2_grades = [100, 85, 60, 90, 70]

plt.figure(figsize=(2, 2), dpi=220)   # new 2" x 2" figure at 220 dpi

# Tick-label font size, chosen by trial and error. These calls are kept
# ahead of the scatter call, as in the original ordering.
# (Original note: the changes from the last graph are still in effect.)
plt.xticks(fontsize=7)
plt.yticks(fontsize=7)

plt.scatter(test_1_grades, test_2_grades)

plt.xlabel("test 1 grade")
plt.ylabel("test 2 grade")
plt.title("Axes Are Comparable")

# Use the same distance scale on both axes.
plt.axis("equal")

plt.show()

# Same scatterplot, but each axis is left to auto-scale to its own range,
# which makes the two axes not directly comparable (misleading).

test_1_grades = [99, 90, 85, 97, 80]
test_2_grades = [100, 85, 60, 90, 70]

plt.figure(figsize=(2, 2), dpi=220)   # new 2" x 2" figure at 220 dpi

plt.scatter(test_1_grades, test_2_grades)

plt.xlabel("test 1 grade")
plt.ylabel("test 2 grade")
plt.title("Axes Aren't Comparable")

plt.show()

# ... and the non-misleading version again: same data (test_1_grades and
# test_2_grades from above), with both axes forced onto the same scale.

plt.figure(figsize=(2, 2), dpi=220)   # new 2" x 2" figure at 220 dpi

plt.scatter(test_1_grades, test_2_grades)

plt.xlabel("test 1 grade")
plt.ylabel("test 2 grade")
plt.title("Axes Are Comparable")

plt.axis("equal")   # <=== force axes to have same scale

plt.show()