# prep hospitals accessibility.py

import numpy as np
import pandas as pd
import geopandas as gpd
import os
from scipy.spatial import cKDTree

############
#
# Globals & Functions
#
############
# Root folder for every input and output dataset. Built with os.path.join so
# the script is not tied to the Windows path separator.
data_dir = os.path.join(".", "data")

# British National Grid, the projected CRS used for all spatial work below.
# Written as an authority string because the {"init": "epsg:..."} dictionary
# form is deprecated by pyproj; the rest of the file already passes
# "EPSG:4326" in this form.
GBcrs = "EPSG:27700"

# OS Open Roads accessed from: https://www.ordnancesurvey.co.uk/opendatadownload/products.html
open_roads_dir = os.path.join(data_dir, "OS Open Roads")

# Hospital Names and Locations: https://data.gov.uk/dataset/f4420d1c-043a-42bc-afbc-4c0f7d3f1620/hospitals
hosp_data_dir = os.path.join(data_dir, "hospital-data")
eng_hosp_file = "Hospital.csv"
wales_hosp_file = "Wales hospital list.xls"

# LSOA 2011 Pop Weighted Centroids: https://geoportal.statistics.gov.uk/datasets/lower-layer-super-output-areas-december-2011-population-weighted-centroids
lsoa_pop_weighted_centroids_file = "Lower_Layer_Super_Output_Areas_December_2011_Population_Weighted_Centroids.csv"

# Wards 2019 boundaries
ward_boundaries = "Wards_December_2019_Boundaries_UK_BGC.shp"

# LSOA 2011 Boundaries
lsoa_boundaries = "Lower_Layer_Super_Output_Areas_December_2011_Boundaries_EW_BGC.shp"

# Island wards to exclude - Isles of Scilly and Isle of Wight wards.
# (The file name keeps its original spelling because it must match the file on disk.)
island_ward_boundaries = "Wards_December_2019_Boundaries_Scily_Wight_BGC.shp"


##########################
#
# Load separate data sources for England and Wales Hospitals
#
# These might be more outdated than the data above but the England file is geocoded and only contains hospitals and the Welsh one
# contains info on type of hospital (eg major/community/etc)
# 
# Can use these to filter out just the hospitals I am interested in from the data above
#
# England hospitals: https://data.gov.uk/dataset/f4420d1c-043a-42bc-afbc-4c0f7d3f1620/hospitals
# Wales hospitals: https://www.whatdotheyknow.com/request/list_of_hospitals_3?unfold=1
#
##########################

# Both source files sit in the hospital data folder. The English extract is a
# text file whose fields are separated by "¬"; the Welsh list is an Excel workbook.
_eng_hosp_path = os.path.join(hosp_data_dir, eng_hosp_file)
_wales_hosp_path = os.path.join(hosp_data_dir, wales_hosp_file)

dfEngHosp = pd.read_csv(_eng_hosp_path, sep="¬")
dfWlsHosp = pd.read_excel(_wales_hosp_path)


########################
#
# Filter hospitals by type
#
# Want to remove entries that are unlikely to provide support to
# patients suffering from COVID-19.
#
# This is approximate because 1) the data doesn't contain hospital classification for all entries and
# 2) I do not have a good understanding of what type of hospital is able to provide care to COVID-19 patients.
#
# This page gives some information on Welsh hospital classifications: http://www.datadictionary.wales.nhs.uk/index.html#!WordDocuments/hospitalclassificationcategories.htm
#
#######################

# The English data only distinguishes three categories: 'Hospital', 'UNKNOWN'
# and 'Mental Health Hospital'. Keep everything except the mental health sites.
_is_mental_health = dfEngHosp['SubType'] == 'Mental Health Hospital'
dfEngHosp = dfEngHosp.loc[~_is_mental_health]

# The Welsh data lists many hospital types. These are the ones to keep, chosen using
# http://www.datadictionary.wales.nhs.uk/index.html#!WordDocuments/hospitalclassificationcategories.htm
_wales_types_to_keep = [
    'Major A&E Units',
    'Community',
    'Acute',
    'Major acute',
    'Minor A&E Units',
]

# Remove closed Wales hospitals, then keep only the selected types
_is_open = dfWlsHosp['Type'] != 'Closed'
dfWlsHosp = dfWlsHosp.loc[_is_open]
dfWlsHosp = dfWlsHosp.loc[dfWlsHosp['Type'].isin(_wales_types_to_keep)]


##########################
#
# ONSPD Feb 2020
#
# Used to geocode postcodes in the Wales hospital data: https://geoportal.statistics.gov.uk/search?collection=Dataset&sort=name&tags=all(PRD_ONSPD%2CFEB_2020)
#
##########################
# Folder holding the ONSPD extract. Each path component is passed separately so
# the separator is chosen by the operating system rather than hard-coded as "\\".
onspd_dir = os.path.join(data_dir, "postcodes", "Data")

# chunksize makes read_csv return an iterator of 10,000-row DataFrames rather
# than a single DataFrame holding the whole file; the chunks are consumed by the
# geocoding loop further down.
onspd_chunks = pd.read_csv(os.path.join(onspd_dir, "ONSPD_FEB_2020_UK.csv"), chunksize=10000)


##########################
#
# Gather postcodes to geocode
#
# All the postcodes in the NHS Trust Sites data and the postcodes in the Wales hospitals data
# 
# There should be overlapping postcodes in these.
#
##########################

def clean_pcd(pcd_string):
    """Normalise a postcode so differently formatted copies compare equal.

    Every space character is removed and the result is upper-cased.

    Parameters
    ----------
    pcd_string : str
        Postcode text, e.g. "cf14 4xw".

    Returns
    -------
    str
        The postcode with no spaces and in upper case, e.g. "CF144XW".
    """
    # str.replace substitutes every occurrence in one call, so no loop is needed.
    return pcd_string.replace(" ", "").upper()

# Every Welsh hospital needs a postcode, otherwise it cannot be geocoded
assert not dfWlsHosp['Postcode'].isnull().any()

dfWlsHosp['postcode_clean'] = dfWlsHosp['Postcode'].map(clean_pcd)

pcds_to_geocode = dfWlsHosp['postcode_clean'].unique()

# Scan the ONSPD chunk by chunk, keeping only the rows for postcodes we need.
# The matches are collected in a list and concatenated once at the end, so the
# accumulated rows are not re-copied on every iteration.
onspd_matches = []

for dfONSPD in onspd_chunks:

    # Clean postcode so it is comparable with the hospital postcodes
    dfONSPD['pcds_clean'] = dfONSPD['pcds'].map(clean_pcd)

    # Filter just the ones we want
    dfONSPD = dfONSPD.loc[dfONSPD['pcds_clean'].isin(pcds_to_geocode), ['pcds', 'pcds_clean', 'lat', 'long']]

    onspd_matches.append(dfONSPD)

# pd.concat rejects an empty list, so fall back to an empty frame if no chunks were read
dfONSPDFiltered = pd.concat(onspd_matches) if onspd_matches else pd.DataFrame()

# Check that all postcodes have been found: one ONSPD row per distinct postcode
assert pcds_to_geocode.shape[0] == dfONSPDFiltered.shape[0]

# Geocode wales hospitals: attach the ONSPD coordinates through the cleaned postcode
dfWlsHosp = dfWlsHosp.merge(dfONSPDFiltered, how='left', left_on='postcode_clean', right_on='pcds_clean')

# With a left join an unmatched hospital would carry NaN coordinates, so
# confirm that every row received both values
for _coord_col in ('lat', 'long'):
    assert not dfWlsHosp[_coord_col].isnull().any()


# Create geo data frames
#
# Points are built from longitude/latitude columns, declared as EPSG:4326 and
# then reprojected to the British National Grid CRS held in GBcrs.

gdfWlsHosp = gpd.GeoDataFrame(dfWlsHosp, geometry = gpd.points_from_xy(dfWlsHosp['long'], dfWlsHosp['lat']), crs = "EPSG:4326")
gdfWlsHosp = gdfWlsHosp.to_crs(GBcrs)

# Drop English hospitals without coordinates BEFORE building geometries, so that
# no points are created from, or reprojected with, NaN values. dropna returns a
# new frame, leaving dfEngHosp itself untouched.
_eng_with_coords = dfEngHosp.dropna(subset = ['Latitude', 'Longitude'])
gdfEngHosp = gpd.GeoDataFrame(_eng_with_coords, geometry = gpd.points_from_xy(_eng_with_coords.Longitude, _eng_with_coords.Latitude), crs = "EPSG:4326")
gdfEngHosp = gdfEngHosp.to_crs(GBcrs)


##################################
#
# Combine Welsh and English hospitals into a single hospitals dataframe
#
##################################

# Give the Welsh frame the same column names as the English one so that the two
# can be stacked
_welsh_to_english_cols = {"Hospital":"OrganisationName", 'lat':'Latitude', 'long':'Longitude', 'Type':'SubType'}
gdfWlsHosp = gdfWlsHosp.rename(columns = _welsh_to_english_cols)
gdfWlsHosp['OrganisationCode'] = None # Don't have this info for the Welsh hospitals

# The Welsh frame already carries a cleaned postcode; add the same column for England
gdfEngHosp['postcode_clean'] = gdfEngHosp['Postcode'].map(clean_pcd)

# join = 'inner' keeps only the columns present in both frames
gdfHospitals = pd.concat([gdfEngHosp, gdfWlsHosp], join = 'inner')

# Create unique index that works for both english and welsh hospitals (since welsh
# hospitals data doesn't include organisation codes): hospital name with spaces
# turned into underscores, then the cleaned postcode
_name_part = gdfHospitals['OrganisationName'].str.replace(" ","_")
gdfHospitals["NamePCD"] = _name_part + "_" + gdfHospitals['postcode_clean']

# There are two pairs of hospitals with the same name and same postcode, treat these as duplicates
gdfHospitals = gdfHospitals.drop_duplicates("NamePCD")

# Save this
gdfHospitals.to_file(os.path.join(hosp_data_dir, "England_and_Wales_Hospitals.shp"))

##############
#
# Read and clean data
#
##############

##
## Wards
##
# Load ward boundaries and get ward centroids
gdfWards = gpd.read_file(os.path.join(data_dir, "boundaries", ward_boundaries))
gdfWards = gdfWards.to_crs(GBcrs)

# Load island wards - selected manually using QGIS
gdfWardsIsland = gpd.read_file(os.path.join(data_dir, "boundaries", island_ward_boundaries))
gdfWardsIsland = gdfWardsIsland.to_crs(GBcrs)

# Only work with England and Wales wards for now: keep ward codes whose first
# character is 'E' or 'W'. The vectorised test replaces two element-wise lambda
# passes and treats an empty or missing code as "not E/W" instead of failing.
gdfWards = gdfWards.loc[gdfWards['WD19CD'].str[0].isin(['E', 'W'])]

# Exclude Isles of Scilly and Isle of Wight for now - need to add in ferry crossings to connect with road network
gdfWards = gdfWards.loc[~gdfWards['WD19CD'].isin(gdfWardsIsland['WD19CD'])]

# Replace each ward polygon with a point built from the LONG / LAT columns of
# the boundary file (presumably the ward's representative point - confirm
# against the dataset documentation). The points are longitude/latitude, so the
# CRS is declared as EPSG:4326 before reprojecting to British National Grid.
gdfWardsCent = gdfWards.copy()
gdfWardsCent['geometry'] = gpd.points_from_xy(gdfWardsCent.LONG, gdfWardsCent.LAT)
gdfWardsCent.crs = "EPSG:4326"
gdfWardsCent = gdfWardsCent.to_crs(GBcrs)
gdfWardsCent.to_file(os.path.join(data_dir, "boundaries", "ward_centroids.shp"))

##
## LSOAs
##
# LSOA boundary polygons, reprojected to British National Grid
gdfLSOAs = gpd.read_file(os.path.join(data_dir, "boundaries", lsoa_boundaries)).to_crs(GBcrs)


# Population weighted centroids come as a CSV with X / Y columns; the
# coordinates are declared to be in GBcrs without any reprojection
dfLSOACent = pd.read_csv(os.path.join(data_dir, lsoa_pop_weighted_centroids_file))
_lsoa_points = gpd.points_from_xy(dfLSOACent.X, dfLSOACent.Y)
gdfLSOACent = gpd.GeoDataFrame(dfLSOACent, geometry = _lsoa_points)
gdfLSOACent.crs = GBcrs

# Filter out centroids on Isle of Wight and Isles of Scilly: the 'difference'
# overlay keeps the centroids that do not fall inside the island wards, and the
# reindex restores the original set and order of columns afterwards
lsoa_cols = gdfLSOACent.columns
gdfLSOACent = gpd.overlay(gdfLSOACent, gdfWardsIsland, how = 'difference').reindex(columns = lsoa_cols)


##
## Open Roads Data
##

# NOTE: this section only needs to run once. After the per-tile files have been
# combined and saved as single shapefiles it can be skipped.


# Combine multiple OS Road Nodes files into a single geodataframe.
# endswith() selects only the shapefiles themselves, not sidecar files whose
# names merely contain "RoadNode.shp" (e.g. "*.shp.xml").
road_node_files = [i for i in os.listdir(open_roads_dir) if i.endswith("RoadNode.shp")]

# Read every tile into a list and concatenate once, instead of growing the
# combined frame inside the loop (which re-copies all earlier rows each time).
road_node_frames = []
for f in road_node_files:
    gdf = gpd.read_file(os.path.join(open_roads_dir, f))
    gdf['file'] = f  # remember which tile each node came from
    road_node_frames.append(gdf)
gdfRoadNodes = pd.concat(road_node_frames)
gdfRoadNodes = gdfRoadNodes.to_crs(GBcrs)
gdfRoadNodes.to_file(os.path.join(data_dir, "OS Open Road Nodes.shp"))


# Same again for the road links (no source-file column is recorded for these)
road_link_files = [i for i in os.listdir(open_roads_dir) if i.endswith("RoadLink.shp")]

road_link_frames = []
for f in road_link_files:
    gdf = gpd.read_file(os.path.join(open_roads_dir, f))
    road_link_frames.append(gdf)
gdfRoadLinks = pd.concat(road_link_frames)
gdfRoadLinks = gdfRoadLinks.to_crs(GBcrs)
gdfRoadLinks.to_file(os.path.join(data_dir,"OS Open Road Links.shp"))


########################
#
# Use road network to find distances to nearest hospitals
#
########################

import networkx as nx
import pandana as pdna

# Extract the road network topology from the gis data
print("Loading network topology")

# NOTE: the shapefile -> CSV extraction below only needs to run once; after the
# two CSV files exist, only the read_csv calls further down are required.

gdfRoadNodes = gpd.read_file(os.path.join(data_dir, "OS Open Road Nodes.shp"))
gdfRoadLinks = gpd.read_file(os.path.join(data_dir,"OS Open Road Links.shp"))

# Nodes become a plain table of coordinates keyed by identifier; links keep
# only their two end nodes and their length
_node_table = pd.DataFrame({"identifier":gdfRoadNodes.identifier, "x": gdfRoadNodes.geometry.x, "y":gdfRoadNodes.geometry.y})
dfNodes = _node_table.set_index("identifier")
dfLinks = gdfRoadLinks[['startNode', 'endNode', 'length']]
dfNodes.to_csv(os.path.join(data_dir, "OS Open Road Nodes.csv"))
dfLinks.to_csv(os.path.join(data_dir, "OS Open Road Links.csv"))

# Drop the references to the geodataframes so their memory can be reclaimed
gdfRoadNodes = gdfRoadLinks = None

# Reload the lightweight tables from the CSV files
dfNodes = pd.read_csv(os.path.join(data_dir, "OS Open Road Nodes.csv"))
dfLinks = pd.read_csv(os.path.join(data_dir, "OS Open Road Links.csv"))

# Remove fully duplicated node rows, then key the table by node identifier
dfNodes = dfNodes.drop_duplicates().set_index('identifier')

# Hospitals are keyed by NamePCD from here on
gdfHospitals = gdfHospitals.set_index('NamePCD')

# Get largest connected component
#
# The road data contains disconnected fragments. Only the largest connected set
# of nodes is kept, and the network further down is built from it.
print("Finding largest connected component")
edges = dfLinks.loc[:,['startNode','endNode','length']].values
G = nx.Graph()
G.add_weighted_edges_from(edges)

# max() finds the biggest component in a single pass; sorting every component
# just to take the first one is unnecessary work on a graph of this size.
largest_connected_component = max(nx.connected_components(G), key = len)

# Clean up to save memory
G = None
edges = None

# Create pandana network. It's much faster for nearest point-of-interest analysis
print("Creating Network")

# Filter nodes and edges to just include those in the largest connected component.
# A link is kept only when both of its end nodes belong to the component.
_start_in_lcc = dfLinks['startNode'].isin(largest_connected_component)
_end_in_lcc = dfLinks['endNode'].isin(largest_connected_component)
dfLinksLCC = dfLinks.loc[_start_in_lcc & _end_in_lcc]

# largest_connected_component is a set. pandas >= 2.0 raises TypeError when a
# set is used as a .loc indexer, so it is converted to a list first.
dfNodesLCC = dfNodes.loc[list(largest_connected_component)]

net=pdna.Network(dfNodesLCC["x"], dfNodesLCC["y"], dfLinksLCC["startNode"], dfLinksLCC["endNode"], dfLinksLCC[["length"]])
#net.save_hdf5(os.path.join(data_dir, "pandana_network.h5"))

# Can then get the nearest 3 hospitals to each node in the network
print("Finding nearest hospitals")
# Maximum network distance searched, in the units of the link 'length' column
# (presumably metres - confirm against the OS Open Roads specification)
search_distance = 200000
net.set_pois("hospitals", search_distance, 3, gdfHospitals.geometry.x, gdfHospitals.geometry.y)
dfNear = net.nearest_pois(search_distance,"hospitals", num_pois=3, include_poi_ids=True)


######################################
#
#
# Get distance to hospitals from Ward centroids
#
# Filter dfNear to include only the road nodes that are nearest to ward centroids
#
######################################

# Select only the Wards centroids and their nearest hospitals rather than every road node
ward_road_nodes = net.get_node_ids(gdfWardsCent.geometry.x, gdfWardsCent.geometry.y)

# Several centroids can snap to the same road node. Each node is selected once
# here, so the merge below yields exactly one row per ward; looking the node up
# once per ward would multiply the rows for any shared node.
dfNearestHospitals = dfNear.loc[dfNear.index.isin(ward_road_nodes)]

# Include Ward codes
gdfWardsCent['ward_road_node'] = ward_road_nodes

dfNearestHospitals = dfNearestHospitals.reset_index()
dfNearestHospitals = pd.merge(dfNearestHospitals, gdfWardsCent, left_on = 'identifier', right_on = 'ward_road_node', how = 'outer', indicator = True)
assert gdfWardsCent.shape[0] == dfNearestHospitals.shape[0]

# Check each ward centroid was matched to a road node and has its nearest 3 hospitals
print("Checking hospitals found - wards")
_merge_counts = dfNearestHospitals['_merge'].value_counts()
assert _merge_counts['left_only'] == 0
assert _merge_counts['right_only'] == 0
for _poi_col in ('poi1', 'poi2', 'poi3'):
    assert not dfNearestHospitals[_poi_col].isnull().any()


########################################
#
#
# Get distance to hospitals from LSOA Centroids
#
# Repeat the steps above but for LSOA centroids
#
########################################

# Get just the LSOA centroids and their nearest hospitals
lsoa_road_nodes = net.get_node_ids(gdfLSOACent.X, gdfLSOACent.Y)

# Some LSOA centroids are very close together and share the same nearest road
# node. Each node is selected once here, so the merge below yields exactly one
# row per LSOA and no duplicated rows have to be dropped afterwards.
dfNearLSOACentroids = dfNear.loc[dfNear.index.isin(lsoa_road_nodes)]

# Include LSOA codes
gdfLSOACent['lsoa_road_node'] = lsoa_road_nodes
dfNearLSOACentroids = dfNearLSOACentroids.reset_index()
dfNearLSOACentroids = pd.merge(dfNearLSOACentroids, gdfLSOACent, left_on = 'identifier', right_on = 'lsoa_road_node', how = 'outer', indicator = True)

# One output row per LSOA centroid, even where centroids share a road node
assert gdfLSOACent.shape[0] == dfNearLSOACentroids.shape[0]

# Check each LSOA centroid was matched to a road node and has its nearest 3 hospitals
print("Checking hospitals found - lsoas")
_merge_counts = dfNearLSOACentroids['_merge'].value_counts()
assert _merge_counts['left_only'] == 0
assert _merge_counts['right_only'] == 0
for _poi_col in ('poi1', 'poi2', 'poi3'):
    assert not dfNearLSOACentroids[_poi_col].isnull().any()

#########################
#
# Clearing up and making output more presentable
#
#########################

##
## Wards
##

# Compute average distance to nearest 3 hospitals
print("Formatting and saving wards data")

# Columns 1, 2 and 3 hold the distances to the first, second and third nearest hospital
_distance_cols = [1, 2, 3]
dfNearestHospitals['mean_distance_nearest_three_hospitals'] = dfNearestHospitals[_distance_cols].mean(axis = 1)

# Columns to keep, in output order
_ward_output_cols = ['WD19CD', 'WD19NM', 'ward_road_node',
                     1, 2, 3, 'poi1', 'poi2', 'poi3',
                     'mean_distance_nearest_three_hospitals']

# Friendlier names for the saved file. NOTE(review): the poi columns presumably
# hold the hospitals' NamePCD identifiers rather than bare postcodes - confirm.
_ward_output_names = {1:'distance1', 2:'distance2', 3:'distance3',
                      'poi1':'hospital_postcode1', 'poi2':'hospital_postcode2', 'poi3':'hospital_postcode3',
                      'ward_road_node':'ward_centroid_road_node'}

dfNearestHospitals = dfNearestHospitals.reindex(columns = _ward_output_cols).rename(columns = _ward_output_names)

dfNearestHospitals.to_csv(os.path.join(data_dir, "england-wales-nearest-hospitals-ward.csv"), index=False)


##
## LSOAs
##
print("Formatting and saving LSOAs data")

# Mean of the distances to the first, second and third nearest hospital (columns 1, 2, 3)
_distance_cols = [1, 2, 3]
dfNearLSOACentroids['mean_distance_nearest_three_hospitals'] = dfNearLSOACentroids[_distance_cols].mean(axis = 1)

# Columns to keep, in output order
_lsoa_output_cols = ['lsoa11cd', 'lsoa11nm', 'lsoa_road_node',
                     1, 2, 3, 'poi1', 'poi2', 'poi3',
                     'mean_distance_nearest_three_hospitals']

# Friendlier names for the saved file. NOTE(review): the poi columns presumably
# hold the hospitals' NamePCD identifiers rather than bare postcodes - confirm.
_lsoa_output_names = {1:'distance1', 2:'distance2', 3:'distance3',
                      'poi1':'hospital_postcode1', 'poi2':'hospital_postcode2', 'poi3':'hospital_postcode3',
                      'lsoa_road_node':'lsoa_centroid_road_node'}

dfNearLSOACentroids = dfNearLSOACentroids.reindex(columns = _lsoa_output_cols).rename(columns = _lsoa_output_names)

dfNearLSOACentroids.to_csv(os.path.join(data_dir, "england-wales-nearest-hospitals-lsoa.csv"), index=False)


####################
#
# Make a map
#
###################
print("Creating ward map")
from matplotlib import pyplot as plt

# Attach the distance results to the ward polygons; the inner join keeps only
# wards that have a result
gdfWards = gdfWards.merge(dfNearestHospitals, how = 'inner', left_on = 'WD19CD', right_on = 'WD19CD')

# Choropleth of the mean distance, drawn without polygon outlines or axes
legend_info = {'label': "Mean distance to nearest 3 hospitals (m)"}
f, ax = plt.subplots(nrows = 1, ncols = 1, figsize = (30,30))
gdfWards.plot(ax = ax, column = 'mean_distance_nearest_three_hospitals', linewidth=0, legend = True, legend_kwds = legend_info)
plt.axis('off')
f.savefig(os.path.join(data_dir, 'england-wales-nearest-hospitals-ward.png'))


print("Creating lsoa map")

# Same again for LSOAs; the code column is upper case in the boundary file and
# lower case in the results table
gdfLSOAs = gdfLSOAs.merge(dfNearLSOACentroids, how = 'inner', left_on = 'LSOA11CD', right_on = 'lsoa11cd')

legend_info = {'label': "Mean distance to nearest 3 hospitals (m)"}
f, ax = plt.subplots(nrows = 1, ncols = 1, figsize = (30,30))
gdfLSOAs.plot(ax = ax, column = 'mean_distance_nearest_three_hospitals', linewidth=0, legend = True, legend_kwds = legend_info)
plt.axis('off')
f.savefig(os.path.join(data_dir, 'england-wales-nearest-hospitals-lsoa.png'))