Percentile Bands
for Unemployment Data in 380 U.S. Metro Areas
Show Mean Min-Max 1%-99% 5%-95% 10%-90% 15%-85% 20%-80% 25%-75% 33%-66%
Range:
Metro Area:
path.msa {stroke: rgba(128, 128, 128, .3);stroke-width: 1px;fill: none;}path.msa:hover {stroke: blue;stroke-width: 2px;}path.active {stroke: blue;stroke-width: 2px;}.percentile {stroke: none; /* rgba(128, 160, 128, .75);*/fill: rgba(128, 160, 128, .5);pointer-events: none;}path.mean {stroke: black;stroke-width: 2px;fill: none;}path.mean.hidden {visibility: hidden;}div#container {background-color: white;width: 920px;height: 630px;display: none;}div#container h2 {margin-left: .5em;margin-top: 5px;margin-bottom: .3em;display: inline-block;}div#vis {margin-left: 10px;margin-bottom: .5em;width: 900px;height: 500px;}div#nav {font-family: sans-serif;margin-left: 5px;float: right;width: 400px;}label {display: inline-block;width: 95px;margin-bottom: 0px;}input[type="radio"], input[type="checkbox"] {margin-top: 0px;}div#info {font-family: sans-serif;margin-left: 1em;}path.dropline {fill: none;stroke: gray;stroke-width: .5px;stroke-dasharray: 2,3;visibility: hidden;}path.visible {visibility: visible;}.axis path, .axis line {stroke: gray;stroke-width: 1px;fill: none;shape-rendering: crispEdges;}.axis text {font-family: sans-serif;color: gray;font-size: 10px;}var unemployment;var percentiles;var percentOffset = 2;var showMean = true;var width = 900;var height = 500;var leftMargin = 20;var bottomMargin = 20;var foreground;var background;var timeScale;var valueScale;// for IE9, from https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/forEachif (!Array.prototype.forEach) { Array.prototype.forEach = function (fn, scope) { 'use strict'; var i, len; for (i = 0, len = this.length; i < len; ++i) { if (i in this) { fn.call(scope, this[i], i, this); } } };}function drawPercentileBand() {foreground.selectAll('.percentile').remove();if (percentOffset >= 0) {var area = d3.svg.area().x(function(d) { return timeScale(d.date); }).y0(function(d) { return valueScale(d.percentiles[percentOffset]); }).y1(function(d) { return 
valueScale(d.percentiles[15-percentOffset]); });foreground.append('path').datum(percentiles).attr('class', 'percentile').attr('d', area);}}function switchPercentile(newOffset) {percentOffset = newOffset;drawPercentileBand();}function makeChart() {var svg = d3.select('#vis').append('svg').attr('width', width).attr('height', height);timeScale = d3.time.scale().domain([new Date('1/1/2003'), new Date('6/1/2013')]).range([leftMargin, width]);valueScale = d3.scale.linear().domain([0, 32]).range([height-bottomMargin, 0]);var line = d3.svg.line().x(function(d, i) { return timeScale(unemployment.dates[i]); }).y(function(d, i) { return valueScale(d)});background = svg.append('g');foreground = svg.append('g');drawPercentileBand();foreground.append('g').attr('class', 'axis').attr('transform', 'translate('+leftMargin+',0)').call(d3.svg.axis().scale(valueScale).orient('left').tickSubdivide(4));foreground.append('g').attr('class', 'axis').attr('transform', 'translate(0,'+(height-bottomMargin)+')').call(d3.svg.axis().scale(timeScale).orient('bottom').tickSubdivide(11).tickSize(7, 3, 0));var dropLineLeft = foreground.append('path').attr('class', 'dropline');var dropLineDown = foreground.append('path').attr('class', 'dropline');var dateSpan = unemployment.dates[unemployment.dates.length-1]-unemployment.dates[0];unemployment.values.forEach(function(msa) {background.append('path').attr('d', line(msa.rates)).attr('class', 'msa '+msa.laus).on('mouseover', function(event) {var mouseCoords = d3.mouse(svg.node());mouseCoords[0] -= leftMargin;//tooltip.attr('x', mouseCoords[0]).attr('y', mouseCoords[1]-15).classed('visible', true);dropLineLeft.attr('d', 'M '+(mouseCoords[0]+leftMargin)+' '+mouseCoords[1]+' L 0 '+mouseCoords[1]).classed('visible', true);dropLineDown.attr('d', 'M '+(mouseCoords[0]+leftMargin)+' '+mouseCoords[1]+' L '+(mouseCoords[0]+leftMargin)+' '+height).classed('visible', true);var valIndex = d3.bisect(unemployment.dates, new 
Date(unemployment.dates[0].getTime()+dateSpan*mouseCoords[0]/(width-leftMargin)))-1;d3.select('#city').html('<b>'+msa.city+'</b>');var date = unemployment.dates[valIndex];d3.select('#values').html(('0'+(date.getMonth()+1)).slice(-2)+'/'+date.getFullYear()+': <b>'+msa.rates[valIndex].toFixed(1)+'%</b>');}).on('mouseout', function(event) {dropLineLeft.classed('visible', false);dropLineDown.classed('visible', false);})// .append('title')// .text(msa.laus);});var meanLine = d3.svg.line().x(function(d) { return timeScale(d.date); }).y(function(d) { return valueScale(d.mean)});background.append('path').attr('d', meanLine(percentiles)).attr('class', 'mean').append('title').text('Mean');}function toggle(lauslist) {lauslist.forEach(function(laus) {var laus = background.select(laus);laus.classed('active', !laus.classed('active'));});}function toggleMean() {showMean = !showMean;d3.select('.mean').classed('hidden', !showMean);}function switchToPercentile(p) {d3.select('#p'+p).property('checked', true);percentOffset = p;drawPercentileBand();}d3.select('#container').style('display', 'block');queue().defer(d3.json, '/wp-content/uploads/2013/07/unemployment.json').defer(d3.json, '/wp-content/uploads/2013/07/Unemployment-Percentiles.json').await(function(error, unempData, percData) {unempData.dates.forEach(function(d, i, dates) {dates[i] = new Date(d);});unemployment = unempData;percData.forEach(function(d) {d.date = new Date(d.date);});percentiles = percData;makeChart();});
This article contains a visualization that will not show up in newsreaders (or if you have JavaScript turned off). Click the image above to go to the website, and/or turn on JavaScript, so you can interact.
The visualization above shows the unemployment rate in 380 metro areas in the U.S. from January 2003 to June 2013 (data from the Bureau of Labor Statistics). Each of these is itself an average, but the overall mean is also shown as a heavier line. Mouse over to see individual metro areas highlighted.
As you explore, you will see many small and large patterns that the average, or mean, completely misses. You can see some outliers with very high unemployment, Hurricane Katrina, seemingly random spikes, etc. (click these links to highlight them in the visualization above, click again to turn the highlight off). That is part of the function of the mean: it averages away small changes. That can be a desired effect, but it is often glossed over when numbers like unemployment rates are reported. Does a small change in the average taken over 200 million people really mean much? Worse yet, does no change mean that nothing happened?
How do you account for the large variation in this data, though? One way is to include a range based on percentiles. The most obvious would be to report the range from smallest to largest value. That does tend to be very sensitive to outliers, however, which may or may not be desirable. Instead, perhaps a narrower range should be reported that covers most of the data, with the extreme values treated separately. But which one?
Percentiles are one of the simplest ideas in statistics: sort the data values, then pick the ones you want depending on their location in that list (as a fraction of the length of the list). The value in the middle is the 50th percentile, also known as the median. The value one quarter of the way into the list is the 25th percentile, etc. Picking the range of values from the 25th to the 75th percentile selects half the data (dropping the bottom and top quarters); this is also called the interquartile range.
A common way of looking at data is to drop the top and bottom 5%, which leaves the range from 5% to 95% (clicking these links will change the settings of the visualization above). That removes quite a bit of the range, though. Is 1% to 99% better? How about the interquartile range? Talking about percentiles in the abstract is one thing, but seeing how much of the data, and how much of the range of values, each choice ignores is quite another.
Calculating percentiles requires additional data. With unemployment data, some is available for metro areas, sectors, and a number of demographic variables. In other cases, that data is often not easy to find or simply not available. But whenever possible, we need to demand more context than a single number. A simple mean without such context is meaningless.