Looking at the outliers in the crowdsourced postcodes

Posted on 07 Sep 2013

A few weeks ago, I published some data on the accuracy of crowdsourced postcode data.

This weekend I had a brief quiet moment, so I thought I'd try out a few visualisations to see if there was any particular grouping around the inaccuracies. There doesn't seem to be anything systematic.

The chart shows the npe-sourced data, the OS-sourced data and an arrow between them, coloured according to the size of the difference.

In [271]:
import csv
from multiprocessing import Pool
import itertools

# Accumulators filled while distances.csv is read.
distances = []                            # parsed rows, each with its vector appended
arrow_loc, arrow_vec, dist = [], [], []   # arrow origins, arrow vectors, distances

# Only rows whose distance is strictly greater than this are kept.
# presumably metres, like the grid coordinates -- TODO confirm
min_dist = 1000

def get_vector(row):
    """Return the displacement from the row's first point to its second.

    ``row[1]`` and ``row[2]`` must be indexable coordinate pairs. The result
    is the two-element list ``[dx, dy]``, computed component-wise as
    ``row[2] - row[1]``.
    """
    end = row[2]
    start = row[1]
    dx = end[0] - start[0]
    dy = end[1] - start[1]
    return [dx, dy]

# Parse distances.csv and keep only the outliers (distance > min_dist).
# Columns read: 0 is kept as-is (presumably the postcode), 1-2 form the first
# coordinate pair, 4-5 the second, and 6 is the distance between them;
# column 3 is not used.
# 'rb' is the mode the csv module expects on Python 2, which this notebook
# targets. The with-statement closes the file, so no explicit close().
with open('distances.csv', 'rb') as csvfile:
    for row in csv.reader(csvfile):
        # Every row is parsed in full, so malformed data still fails loudly
        # even when the row would have been filtered out.
        start = [int(value) for value in row[1:3]]
        end = [int(value) for value in row[4:6]]
        record = (row[0], start, end, float(row[6]))
        if record[3] > min_dist:
            vector = get_vector(record)
            arrow_loc.append(start)
            arrow_vec.append(vector)
            dist.append(record[3])
            # Filtering here replaces a second pass over the full list.
            distances.append(record + (vector,))

# Cap on how many outliers are plotted. As written this is a no-op (limit
# equals the full length); lower `limit` to keep only the first N outliers.
limit = len(distances)
distances = distances[:limit]
arrow_loc = arrow_loc[:limit]
arrow_vec = arrow_vec[:limit]
dist = dist[:limit]

# Transpose the second coordinate pairs into (xs, ys) for plotting.
# Yields an empty list, rather than raising, when no row passed the filter.
o = list(zip(*[record[2] for record in distances]))
In [272]:
import matplotlib
import matplotlib.pylab as pylab
from matplotlib import cm
import numpy as np
import math
import matplotlib.pyplot as plt

%matplotlib inline

# Transpose the per-arrow pairs into (xs, ys) sequences for matplotlib.
# list() materialises them so they can be indexed as a[0], a[1].
a = list(zip(*arrow_loc))   # arrow origins
b = list(zip(*arrow_vec))   # arrow displacement components
c = []                      # per-arrow colour values, filled in below

# Scale the distances so the largest magnitude becomes 1. The conversion to
# an ndarray is explicit: dividing a plain list in place only worked because
# NumPy's reflected division took over and rebound the name to an array.
dist = np.asarray(dist, dtype=float)
dist = dist / np.max(np.abs(dist))
def norm(v, min, max):
    """Return ``|v| * (max - min) + min``.

    A magnitude of 0 gives ``min`` and a magnitude of 1 gives ``max``.
    ``v`` may be a scalar or a sequence/array, since ``np.abs`` accepts both.
    """
    span = max - min
    magnitude = np.abs(v)
    return magnitude * span + min

# Colour value per arrow: each distance passed through norm(..., 0.5, 1).
# With distances already scaled to a maximum of 1 this lands in [0.5, 1].
c = [norm(i, 0.5, 1) for i in dist]

# Plot extent; presumably metres on the OS national grid -- TODO confirm.
xlim = 700000
ylim = 1200000
width = 20

# Size the figure to the aspect ratio of the plotted extent. float() stops
# ylim/xlim truncating to 1 under Python 2 integer division, which would
# otherwise produce a square figure.
pylab.rcParams['figure.figsize'] = width, float(ylim) / xlim * width

plt.xlim(0, xlim)
plt.ylim(0, ylim)

# First coordinate pairs in light grey, second pairs in grey-green.
plt.scatter(a[0], a[1], color=(0.9, 0.9, 0.9))
plt.scatter(o[0], o[1], color=(0.6, 0.7, 0.6))
# angles/scale_units 'xy' with scale=1 draws each arrow in data units, so it
# runs from its origin (a) by exactly the displacement (b).
plt.quiver(a[0], a[1], b[0], b[1], c, width=0.001, cmap='Blues',
           angles='xy', scale_units='xy', scale=1)

# Axis labels, currently disabled:
#plt.ylabel('northing (m)')
#plt.xlabel('easting (m)')
map of differences

StackOverflow Flair

profile for Simon Elliston Ball at Stack Overflow, Q&A for professional and enthusiast programmers